In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
''Pract1 - . To use PCA Algorithm for dimensionality reduction.
You have a dataset that includes measurements for different variables on wine
(alcohol, ash, magnesium, and so on). Apply PCA algorithm & transform this data
so that most variations in the measurements of the variables are captured by a small
number of principal components so that it is easier to distinguish between red and
white wine by inspecting these principal components''
In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
df= pd.read_csv('Wine.csv')
In [7]:
df.describe()
Out[7]:
| Alcohol | Malic_Acid | Ash | Ash_Alcanity | Magnesium | Total_Phenols | Flavanoids | Nonflavanoid_Phenols | Proanthocyanins | Color_Intensity | Hue | OD280 | Proline | Customer_Segment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 |
| mean | 13.000618 | 2.336348 | 2.366517 | 19.494944 | 99.741573 | 2.295112 | 2.029270 | 0.361854 | 1.590899 | 5.058090 | 0.957449 | 2.611685 | 746.893258 | 1.938202 |
| std | 0.811827 | 1.117146 | 0.274344 | 3.339564 | 14.282484 | 0.625851 | 0.998859 | 0.124453 | 0.572359 | 2.318286 | 0.228572 | 0.709990 | 314.907474 | 0.775035 |
| min | 11.030000 | 0.740000 | 1.360000 | 10.600000 | 70.000000 | 0.980000 | 0.340000 | 0.130000 | 0.410000 | 1.280000 | 0.480000 | 1.270000 | 278.000000 | 1.000000 |
| 25% | 12.362500 | 1.602500 | 2.210000 | 17.200000 | 88.000000 | 1.742500 | 1.205000 | 0.270000 | 1.250000 | 3.220000 | 0.782500 | 1.937500 | 500.500000 | 1.000000 |
| 50% | 13.050000 | 1.865000 | 2.360000 | 19.500000 | 98.000000 | 2.355000 | 2.135000 | 0.340000 | 1.555000 | 4.690000 | 0.965000 | 2.780000 | 673.500000 | 2.000000 |
| 75% | 13.677500 | 3.082500 | 2.557500 | 21.500000 | 107.000000 | 2.800000 | 2.875000 | 0.437500 | 1.950000 | 6.200000 | 1.120000 | 3.170000 | 985.000000 | 3.000000 |
| max | 14.830000 | 5.800000 | 3.230000 | 30.000000 | 162.000000 | 3.880000 | 5.080000 | 0.660000 | 3.580000 | 13.000000 | 1.710000 | 4.000000 | 1680.000000 | 3.000000 |
In [8]:
df.head()
Out[8]:
| Alcohol | Malic_Acid | Ash | Ash_Alcanity | Magnesium | Total_Phenols | Flavanoids | Nonflavanoid_Phenols | Proanthocyanins | Color_Intensity | Hue | OD280 | Proline | Customer_Segment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 | 1 |
| 1 | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 | 1 |
| 2 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 | 1 |
| 3 | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 | 1 |
| 4 | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 | 1 |
In [12]:
df.columns
Out[12]:
Index(['Alcohol', 'Malic_Acid', 'Ash', 'Ash_Alcanity', 'Magnesium',
'Total_Phenols', 'Flavanoids', 'Nonflavanoid_Phenols',
'Proanthocyanins', 'Color_Intensity', 'Hue', 'OD280', 'Proline',
'Customer_Segment'],
dtype='object')
In [14]:
# Separate features (X) and target (y)
X = df.drop(columns=['Customer_Segment']) # Drop the target column
y = df['Customer_Segment'] # Set the target variable
In [15]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
In [16]:
# Apply PCA and reduce dimensions to 2 principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
In [17]:
# Create a DataFrame with the principal components
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
pca_df['Customer_Segment'] = y
In [18]:
# Print explained variance ratio
print("Explained variance ratio:", pca.explained_variance_ratio_)
Explained variance ratio: [0.36198848 0.1920749 ]
In [19]:
# Visualize the PCA result
plt.figure(figsize=(8,6))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='Customer_Segment', palette='Set1')
plt.title('PCA of Wine Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()
In [ ]:
In [ ]:
#OR
In [20]:
import pandas as pd
from sklearn.datasets import load_wine
# Load the wine dataset
wine = load_wine()
wine_df = pd.DataFrame(wine.data, columns=wine.feature_names)
wine_df['wine_type'] = wine.target # 0, 1, or 2 corresponding to different types of wine
# Inspect the dataset
print(wine_df.head())
from sklearn.preprocessing import StandardScaler
# Separate features and target
X = wine_df.drop(columns=['wine_type'])
y = wine_df['wine_type']
# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
from sklearn.decomposition import PCA
# Apply PCA
pca = PCA(n_components=2) # Reduce to 2 principal components
X_pca = pca.fit_transform(X_scaled)
# Create a DataFrame with the two principal components
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
pca_df['wine_type'] = y
import matplotlib.pyplot as plt
import seaborn as sns
# Plot the principal components
plt.figure(figsize=(8,6))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='wine_type', palette='Set1')
plt.title('PCA of Wine Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()
alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \ 0 14.23 1.71 2.43 15.6 127.0 2.80 1 13.20 1.78 2.14 11.2 100.0 2.65 2 13.16 2.36 2.67 18.6 101.0 2.80 3 14.37 1.95 2.50 16.8 113.0 3.85 4 13.24 2.59 2.87 21.0 118.0 2.80 flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \ 0 3.06 0.28 2.29 5.64 1.04 1 2.76 0.26 1.28 4.38 1.05 2 3.24 0.30 2.81 5.68 1.03 3 3.49 0.24 2.18 7.80 0.86 4 2.69 0.39 1.82 4.32 1.04 od280/od315_of_diluted_wines proline wine_type 0 3.92 1065.0 0 1 3.40 1050.0 0 2 3.17 1185.0 0 3 3.45 1480.0 0 4 2.93 735.0 0
In [ ]:
#OR
In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Load your wine dataset
df = pd.read_csv('Wine.csv')
# Separating features and target variable
X = df.drop(columns=['Customer_Segment']) # Features (measurements)
y = df['Customer_Segment'] # Target variable (Type of wine)
# Scatter plot before applying PCA
plt.figure(figsize=(10, 6))
plt.scatter(X[y == 1]['Alcohol'], X[y == 1]['Malic_Acid'], label='Customer Segment 1', alpha=0.7)
plt.scatter(X[y == 2]['Alcohol'], X[y == 2]['Malic_Acid'], label='Customer Segment 2', alpha=0.7)
plt.scatter(X[y == 3]['Alcohol'], X[y == 3]['Malic_Acid'], label='Customer Segment 3', alpha=0.7)
plt.xlabel('Alcohol')
plt.ylabel('Malic Acid')
plt.legend()
plt.title('Scatter Plot (Original Data)')
plt.show()
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Apply PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# Create a new DataFrame with the first two principal components
pca_df = pd.DataFrame(data=X_pca, columns=['Principal Component 1', 'Principal Component 2'])
pca_df['Customer_Segment'] = y
# Visualize the data using the first two principal components
plt.figure(figsize=(10, 6))
plt.scatter(pca_df[pca_df['Customer_Segment'] == 1]['Principal Component 1'], pca_df[pca_df['Customer_Segment'] == 1]['Principal Component 2'], label='Customer Segment 1', alpha=0.7)
plt.scatter(pca_df[pca_df['Customer_Segment'] == 2]['Principal Component 1'], pca_df[pca_df['Customer_Segment'] == 2]['Principal Component 2'], label='Customer Segment 2', alpha=0.7)
plt.scatter(pca_df[pca_df['Customer_Segment'] == 3]['Principal Component 1'], pca_df[pca_df['Customer_Segment'] == 3]['Principal Component 2'], label='Customer Segment 3', alpha=0.7)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.title('PCA: Wine Dataset')
plt.show()
In [ ]:
#OR
In [4]:
'''Determining Principle Component Analysis for Wine Dataset'''
# Importing Preliminary Libraries
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model
#from sklearn import linear_model.fit
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.decomposition import FactorAnalysis
!pip install factor_analyzer
from factor_analyzer import FactorAnalyzer
!pip install factor_analyzer
df = pd.read_csv('Wine.csv')
'''Determing only first 5 values '''
df.head()
# Taking only the dependent value from the dataset
df2 = df[['Alcohol', 'Malic_Acid', 'Ash', 'Ash_Alcanity', 'Magnesium', 'Total_Phenols', 'Flavanoids', 'Nonflavanoid_Phenols', 'Proanthocyanins', 'Color_Intensity', 'Hue', 'OD280', 'Proline']]
df2.head()
#PCA cluster plot for Wine Dataset
#Importing libraries from SKLEARN
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
#loading Wine dataset
wine = datasets.load_wine()
X = wine.data
y = wine.target
target_names = wine.target_names
pca = PCA(n_components=13)
wine_X = pca.fit(X).transform(X)
#1st Plot
plt.scatter(wine_X[y == 0, 3], wine_X[y == 0, 0], s =80, c = 'orange', label = 'Type 0')
plt.scatter(wine_X[y == 1, 3], wine_X[y == 1, 0], s =80, c = 'yellow', label = 'Type 1')
plt.scatter(wine_X[y == 2, 3], wine_X[y == 2, 0], s =80, c = 'green', label = 'Type 2')
plt.title('PCA plot for Wine Dataset')
plt.legend()
#2nd Plot
import pandas as pd
wine_dataframe = pd.DataFrame(wine_X, columns=wine.feature_names)
# Create a scatter matrix from the dataframe, color by y_train
grr = pd.plotting.scatter_matrix(wine_dataframe, c=y, figsize=(15, 15), marker='o',
hist_kwds={'bins': 20}, s=60, alpha=.8)
'''KNN classifier which is a type of supervised Machine Learning Technique.
This is used to detect the accuracy and classification of the given dataset'''
# Importing Libraries for Modelling.
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# Assigning values of X and y from dataset
X, y = wine.data[:, :], wine.target
''' Here X is assigned as all the column data(SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm) and
y is assigned as Species value(Iris-setosa, Iris-versicolour, Iris-virginica) ))'''
#Setting training and testing values
Xtrain, Xtest, y_train, y_test = train_test_split(X, y)
scaler = preprocessing.StandardScaler().fit(Xtrain)
Xtrain = scaler.transform(Xtrain)
Xtest = scaler.transform(Xtest)
# Modeling is done using KNN classifiers.
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(Xtrain, y_train)
y_pred = knn.predict(Xtest)
# Display the Output
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Confusion matrix \n', confusion_matrix(y_test, y_pred))
print('Classification \n', classification_report(y_test, y_pred))
from sklearn.metrics import cohen_kappa_score
cluster = cohen_kappa_score(y_test, y_pred)
cluster
X, y = wine.data[:, :], wine.target
Xtrain, Xtest, y_train, y_test = train_test_split(X, y)
#Logistic Regression Accuracy
#Logistic Regression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(Xtrain,y_train)
y_pred = classifier.predict(Xtest)
cm = confusion_matrix(y_test,y_pred)
accuracy = accuracy_score(y_test,y_pred)
print("Logistic Regression :")
print("Accuracy = ", accuracy)
print(cm)
#Cohen Kappa Accuracy for LR
from sklearn.metrics import cohen_kappa_score
cluster = cohen_kappa_score(y_test, y_pred)
cluster
Collecting factor_analyzer
Downloading factor_analyzer-0.5.1.tar.gz (42 kB)
---------------------------------------- 0.0/42.8 kB ? eta -:--:--
--------- ------------------------------ 10.2/42.8 kB ? eta -:--:--
--------------------------- ---------- 30.7/42.8 kB 325.1 kB/s eta 0:00:01
-------------------------------------- 42.8/42.8 kB 417.7 kB/s eta 0:00:00
Installing build dependencies: started
Installing build dependencies: finished with status 'done'
Getting requirements to build wheel: started
Getting requirements to build wheel: finished with status 'done'
Preparing metadata (pyproject.toml): started
Preparing metadata (pyproject.toml): finished with status 'done'
Requirement already satisfied: pandas in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from factor_analyzer) (2.1.4)
Requirement already satisfied: scipy in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from factor_analyzer) (1.11.4)
Requirement already satisfied: numpy in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from factor_analyzer) (1.26.3)
Requirement already satisfied: scikit-learn in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from factor_analyzer) (1.4.2)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pandas->factor_analyzer) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pandas->factor_analyzer) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pandas->factor_analyzer) (2023.4)
Requirement already satisfied: joblib>=1.2.0 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn->factor_analyzer) (1.4.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn->factor_analyzer) (3.2.0)
Requirement already satisfied: six>=1.5 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from python-dateutil>=2.8.2->pandas->factor_analyzer) (1.16.0)
Building wheels for collected packages: factor_analyzer
Building wheel for factor_analyzer (pyproject.toml): started
Building wheel for factor_analyzer (pyproject.toml): finished with status 'done'
Created wheel for factor_analyzer: filename=factor_analyzer-0.5.1-py2.py3-none-any.whl size=42623 sha256=0999ccdf3ec64fa15f9ccc4d8bde5e04051197c912b35434ee7ac2f459dc46c8
Stored in directory: c:\users\lenovo\appdata\local\pip\cache\wheels\a2\af\06\f4d4ed4d9d714fda437fb1583629417319603c2266e7b233cc
Successfully built factor_analyzer
Installing collected packages: factor_analyzer
Successfully installed factor_analyzer-0.5.1
[notice] A new release of pip is available: 24.0 -> 24.3.1 [notice] To update, run: python.exe -m pip install --upgrade pip
Requirement already satisfied: factor_analyzer in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (0.5.1) Requirement already satisfied: pandas in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from factor_analyzer) (2.1.4) Requirement already satisfied: scipy in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from factor_analyzer) (1.11.4) Requirement already satisfied: numpy in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from factor_analyzer) (1.26.3) Requirement already satisfied: scikit-learn in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from factor_analyzer) (1.4.2) Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pandas->factor_analyzer) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pandas->factor_analyzer) (2023.3.post1) Requirement already satisfied: tzdata>=2022.1 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pandas->factor_analyzer) (2023.4) Requirement already satisfied: joblib>=1.2.0 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn->factor_analyzer) (1.4.2) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn->factor_analyzer) (3.2.0) Requirement already satisfied: six>=1.5 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from python-dateutil>=2.8.2->pandas->factor_analyzer) (1.16.0)
[notice] A new release of pip is available: 24.0 -> 24.3.1 [notice] To update, run: python.exe -m pip install --upgrade pip
Accuracy Score: 0.9777777777777777
Confusion matrix
[[17 0 0]
[ 0 15 0]
[ 0 1 12]]
Classification
precision recall f1-score support
0 1.00 1.00 1.00 17
1 0.94 1.00 0.97 15
2 1.00 0.92 0.96 13
accuracy 0.98 45
macro avg 0.98 0.97 0.98 45
weighted avg 0.98 0.98 0.98 45
Logistic Regression :
Accuracy = 0.9333333333333333
[[14 1 0]
[ 0 16 2]
[ 0 0 12]]
C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
Out[4]:
0.8993288590604027
In [ ]:
In [ ]:
#OR
In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
%matplotlib inline
# Importing Dataset
df = pd.read_csv('Wine.csv')
df.head(10)
# Describe the dataset using graph - Plot 1
df.iloc[:, 1:].describe()
for c in df.columns[1:]:
df.boxplot(c, by='Customer_Segment', figsize=(7, 4), fontsize=14)
plt.title("{}\n".format(c), fontsize=16)
plt.xlabel("Wine Customer Segment", fontsize=16)
# Using Standard Scaler
scaler = StandardScaler()
X = df.drop('Customer_Segment', axis=1)
y = df['Customer_Segment']
X = scaler.fit_transform(X)
dfx = pd.DataFrame(data=X, columns=df.columns[1:])
dfx.head(10)
# Plot 2
dfx.describe()
from sklearn.decomposition import PCA
pca = PCA(n_components=None)
dfx_pca = pca.fit(dfx)
plt.figure(figsize=(10, 6))
plt.scatter(x=[i + 1 for i in range(len(dfx_pca.explained_variance_ratio_))],
y=dfx_pca.explained_variance_ratio_,
s=200, alpha=0.75, c='orange', edgecolor='k')
plt.grid(True)
plt.title("Explained variance ratio of the \nfitted principal component vector\n", fontsize=25)
plt.xlabel("Principal components", fontsize=15)
plt.xticks([i + 1 for i in range(len(dfx_pca.explained_variance_ratio_))], fontsize=15)
plt.yticks(fontsize=15)
plt.ylabel("Explained variance ratio", fontsize=15)
plt.show()
# Transform
dfx_trans = pca.transform(dfx)
dfx_trans = pd.DataFrame(data=dfx_trans)
dfx_trans.head(10)
# Plot 3
plt.figure(figsize=(10, 6))
plt.scatter(dfx_trans[0], dfx_trans[1], c=df['Customer_Segment'], edgecolors='k', alpha=0.75, s=150)
plt.grid(True)
plt.title("Customer Segment separation using first two principal components\n", fontsize=20)
plt.xlabel("Principal component-1", fontsize=15)
plt.ylabel("Principal component-2", fontsize=15)
plt.show()
In [ ]:
In [ ]:
#OR
'''B. Apply LDA Algorithm on Iris Dataset and classify which species a given flower
belongs to.
Dataset Link:https://www.kaggle.com/datasets/uciml/iri
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.datasets import load_iris
%matplotlib inline
# Load the Iris dataset
iris = load_iris()
X = iris.data # Features: Sepal length, Sepal width, Petal length, Petal width
y = iris.target # Target: Species
# Convert the dataset into a DataFrame for easy visualization and manipulation
df = pd.DataFrame(data=X, columns=iris.feature_names)
df['Species'] = y
df['Species'] = df['Species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
df.head()
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Initialize the LDA model
lda = LinearDiscriminantAnalysis()
# Fit the model to the training data
lda.fit(X_train, y_train)
# Predict the species for the test set
y_pred = lda.predict(X_test)
# Accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)
# Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=iris.target_names))
# Plot the LDA-transformed features to visualize class separability
X_lda = lda.transform(X)
plt.figure(figsize=(10, 6))
colors = ['red', 'green', 'blue']
for i, color, target_name in zip([0, 1, 2], colors, iris.target_names):
plt.scatter(X_lda[y == i, 0], X_lda[y == i, 1], alpha=0.8, color=color, label=target_name)
plt.xlabel("LDA Component 1")
plt.ylabel("LDA Component 2")
plt.legend(loc='best')
plt.title("LDA Projection of Iris Dataset")
plt.show()
Accuracy: 100.00%
Confusion Matrix:
[[19 0 0]
[ 0 13 0]
[ 0 0 13]]
Classification Report:
precision recall f1-score support
setosa 1.00 1.00 1.00 19
versicolor 1.00 1.00 1.00 13
virginica 1.00 1.00 1.00 13
accuracy 1.00 45
macro avg 1.00 1.00 1.00 45
weighted avg 1.00 1.00 1.00 45
In [ ]:
In [ ]:
#OR
In [ ]:
In [2]:
# importing required packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import datasets
import matplotlib.pyplot as plt
# load iris dataset
iris=datasets.load_iris()
# convert dataset into a pandas dataframe
df = pd.DataFrame(data = np.c_[iris['data'], iris['target']],
columns = iris['feature_names'] + ['target'])
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
df.columns = ['s_length', 's_width', 'p_length', 'p_width', 'target', 'species']
df.head()
# fitting the model
X = df[['s_length', 's_width', 'p_length', 'p_width']]
y = df['species']
model = LinearDiscriminantAnalysis()
model.fit(X, y)
# evaluating the model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores))
X = iris.data
y = iris.target
model = LinearDiscriminantAnalysis()
data_plot = model.fit(X, y).transform(X)
target_names = iris.target_names
plt.figure()
colors = ['red', 'green', 'blue']
lw = 2
for color, i, target_name in zip(colors, [0, 1, 2], target_names):
plt.scatter(data_plot[y == i, 0], data_plot[y == i, 1], alpha=.8, color=color,
label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.show()
# define new observation
new = [5, 2, 1, .4]
# predict which class the new observation belongs to
model.predict([new])
0.9800000000000001
Out[2]:
array([0])
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
''' Pract 2- B. Use the diabetes data set from UCI and Pima Indians Diabetes data set for performing
the following:
a. Univariate analysis: Frequency, Mean, Median, Mode, Variance, Standard
Deviation, Skewness and Kurtosis
b. Bivariate analysis: Linear and logistic regression modeling
c. Multiple Regression analysis
d. Also compare the results of the above analysis for the two data sets'''
In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.linear_model import LinearRegression, LogisticRegression
!pip install statsmodels
import statsmodels.api as sm
# Load the dataset (make sure the file is in your working directory)
df = pd.read_csv('diabetes.csv')
# Display first few rows of the dataset
df.head()
#part 1- univarate analysis
# Define a function for univariate statistics
def univariate_analysis(df):
univariate_stats = {
'Feature': [],
'Mean': [],
'Median': [],
'Mode': [],
'Variance': [],
'Standard Deviation': [],
'Skewness': [],
'Kurtosis': []
}
for column in df.columns:
if np.issubdtype(df[column].dtype, np.number):
univariate_stats['Feature'].append(column)
univariate_stats['Mean'].append(df[column].mean())
univariate_stats['Median'].append(df[column].median())
univariate_stats['Mode'].append(df[column].mode()[0])
univariate_stats['Variance'].append(df[column].var())
univariate_stats['Standard Deviation'].append(df[column].std())
univariate_stats['Skewness'].append(df[column].skew())
univariate_stats['Kurtosis'].append(df[column].kurt())
return pd.DataFrame(univariate_stats)
# Univariate analysis for the dataset
univariate_results = univariate_analysis(df)
print("Univariate Analysis Results:\n", univariate_results)
#part 2 - bivarite analysis
# Linear regression: Predicting BloodPressure from BMI
X = df[['BMI']] # predictor
y = df['BloodPressure'] # target
linear_model = LinearRegression()
linear_model.fit(X, y)
predictions = linear_model.predict(X)
# Plot the regression line
plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='blue', label='Actual data')
plt.plot(X, predictions, color='red', label='Regression line')
plt.xlabel('BMI')
plt.ylabel('Blood Pressure')
plt.title('Linear Regression: Predicting Blood Pressure from BMI')
plt.legend()
plt.show()
# logistic regression
# Logistic regression: Predicting Outcome from Age
X_logistic = df[['Age']] # predictor
y_logistic = df['Outcome'] # target (0 or 1)
logistic_model = LogisticRegression()
logistic_model.fit(X_logistic, y_logistic)
logistic_pred = logistic_model.predict(X_logistic)
# Model summary
print("Logistic Regression Coefficients:\n", logistic_model.coef_)
print("Intercept:", logistic_model.intercept_)
# part 3 - multi regression analysis
# Multiple regression to predict Outcome using multiple predictors
X_multi = df[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']]
y_multi = df['Outcome']
multi_logit_model = sm.Logit(y_multi, sm.add_constant(X_multi)).fit()
print(multi_logit_model.summary())
Collecting statsmodels
Downloading statsmodels-0.14.4-cp312-cp312-win_amd64.whl.metadata (9.5 kB)
Requirement already satisfied: numpy<3,>=1.22.3 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from statsmodels) (1.26.3)
Requirement already satisfied: scipy!=1.9.2,>=1.8 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from statsmodels) (1.11.4)
Requirement already satisfied: pandas!=2.1.0,>=1.4 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from statsmodels) (2.1.4)
Collecting patsy>=0.5.6 (from statsmodels)
Downloading patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Requirement already satisfied: packaging>=21.3 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from statsmodels) (23.2)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2023.4)
Requirement already satisfied: six in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from patsy>=0.5.6->statsmodels) (1.16.0)
Downloading statsmodels-0.14.4-cp312-cp312-win_amd64.whl (9.8 MB)
---------------------------------------- 0.0/9.8 MB ? eta -:--:--
---------------------------------------- 0.0/9.8 MB 682.7 kB/s eta 0:00:15
--------------------------------------- 0.1/9.8 MB 1.6 MB/s eta 0:00:07
-- ------------------------------------- 0.7/9.8 MB 5.7 MB/s eta 0:00:02
------- -------------------------------- 1.7/9.8 MB 11.0 MB/s eta 0:00:01
-------- ------------------------------- 2.0/9.8 MB 9.7 MB/s eta 0:00:01
--------- ------------------------------ 2.3/9.8 MB 9.0 MB/s eta 0:00:01
---------- ----------------------------- 2.7/9.8 MB 9.0 MB/s eta 0:00:01
----------- ---------------------------- 2.9/9.8 MB 8.5 MB/s eta 0:00:01
------------ --------------------------- 3.2/9.8 MB 8.1 MB/s eta 0:00:01
-------------- ------------------------- 3.5/9.8 MB 8.0 MB/s eta 0:00:01
--------------- ------------------------ 3.8/9.8 MB 7.8 MB/s eta 0:00:01
---------------- ----------------------- 4.1/9.8 MB 7.7 MB/s eta 0:00:01
----------------- ---------------------- 4.3/9.8 MB 7.4 MB/s eta 0:00:01
----------------- ---------------------- 4.4/9.8 MB 7.0 MB/s eta 0:00:01
------------------ --------------------- 4.5/9.8 MB 6.7 MB/s eta 0:00:01
------------------ --------------------- 4.6/9.8 MB 6.4 MB/s eta 0:00:01
------------------- -------------------- 4.7/9.8 MB 6.2 MB/s eta 0:00:01
------------------- -------------------- 4.9/9.8 MB 6.0 MB/s eta 0:00:01
-------------------- ------------------- 5.0/9.8 MB 5.8 MB/s eta 0:00:01
--------------------- ------------------ 5.2/9.8 MB 5.7 MB/s eta 0:00:01
--------------------- ------------------ 5.3/9.8 MB 5.6 MB/s eta 0:00:01
---------------------- ----------------- 5.5/9.8 MB 5.5 MB/s eta 0:00:01
----------------------- ---------------- 5.7/9.8 MB 5.4 MB/s eta 0:00:01
----------------------- ---------------- 5.8/9.8 MB 5.3 MB/s eta 0:00:01
------------------------ --------------- 6.0/9.8 MB 5.3 MB/s eta 0:00:01
------------------------- -------------- 6.2/9.8 MB 5.2 MB/s eta 0:00:01
-------------------------- ------------- 6.4/9.8 MB 5.2 MB/s eta 0:00:01
--------------------------- ------------ 6.7/9.8 MB 5.2 MB/s eta 0:00:01
---------------------------- ----------- 6.9/9.8 MB 5.2 MB/s eta 0:00:01
----------------------------- ---------- 7.2/9.8 MB 5.3 MB/s eta 0:00:01
------------------------------ --------- 7.5/9.8 MB 5.3 MB/s eta 0:00:01
------------------------------- -------- 7.9/9.8 MB 5.3 MB/s eta 0:00:01
--------------------------------- ------ 8.2/9.8 MB 5.4 MB/s eta 0:00:01
---------------------------------- ----- 8.6/9.8 MB 5.5 MB/s eta 0:00:01
------------------------------------ --- 8.9/9.8 MB 5.5 MB/s eta 0:00:01
------------------------------------- -- 9.1/9.8 MB 5.6 MB/s eta 0:00:01
------------------------------------- -- 9.1/9.8 MB 5.6 MB/s eta 0:00:01
------------------------------------- -- 9.3/9.8 MB 5.3 MB/s eta 0:00:01
--------------------------------------- 9.6/9.8 MB 5.4 MB/s eta 0:00:01
--------------------------------------- 9.8/9.8 MB 5.4 MB/s eta 0:00:01
---------------------------------------- 9.8/9.8 MB 5.3 MB/s eta 0:00:00
Downloading patsy-0.5.6-py2.py3-none-any.whl (233 kB)
---------------------------------------- 0.0/233.9 kB ? eta -:--:--
--------------- ------------------------ 92.2/233.9 kB 5.1 MB/s eta 0:00:01
---------------------------------------- 233.9/233.9 kB 2.9 MB/s eta 0:00:00
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.6 statsmodels-0.14.4
[notice] A new release of pip is available: 24.0 -> 24.3.1 [notice] To update, run: python.exe -m pip install --upgrade pip
Univariate Analysis Results:
Feature Mean Median Mode Variance \
0 Pregnancies 3.845052 3.0000 1.000 11.354056
1 Glucose 120.894531 117.0000 99.000 1022.248314
2 BloodPressure 69.105469 72.0000 70.000 374.647271
3 SkinThickness 20.536458 23.0000 0.000 254.473245
4 Insulin 79.799479 30.5000 0.000 13281.180078
5 BMI 31.992578 32.0000 32.000 62.159984
6 DiabetesPedigreeFunction 0.471876 0.3725 0.254 0.109779
7 Age 33.240885 29.0000 22.000 138.303046
8 Outcome 0.348958 0.0000 0.000 0.227483
Standard Deviation Skewness Kurtosis
0 3.369578 0.901674 0.159220
1 31.972618 0.173754 0.640780
2 19.355807 -1.843608 5.180157
3 15.952218 0.109372 -0.520072
4 115.244002 2.272251 7.214260
5 7.884160 -0.428982 3.290443
6 0.331329 1.919911 5.594954
7 11.760232 1.129597 0.643159
8 0.476951 0.635017 -1.600930
Logistic Regression Coefficients:
[[0.04202455]]
Intercept: [-2.04744807]
Optimization terminated successfully.
Current function value: 0.470993
Iterations 6
Logit Regression Results
==============================================================================
Dep. Variable: Outcome No. Observations: 768
Model: Logit Df Residuals: 759
Method: MLE Df Model: 8
Date: Sat, 02 Nov 2024 Pseudo R-squ.: 0.2718
Time: 00:04:47 Log-Likelihood: -361.72
converged: True LL-Null: -496.74
Covariance Type: nonrobust LLR p-value: 9.652e-54
============================================================================================
coef std err z P>|z| [0.025 0.975]
--------------------------------------------------------------------------------------------
const -8.4047 0.717 -11.728 0.000 -9.809 -7.000
Pregnancies 0.1232 0.032 3.840 0.000 0.060 0.186
Glucose 0.0352 0.004 9.481 0.000 0.028 0.042
BloodPressure -0.0133 0.005 -2.540 0.011 -0.024 -0.003
SkinThickness 0.0006 0.007 0.090 0.929 -0.013 0.014
Insulin -0.0012 0.001 -1.322 0.186 -0.003 0.001
BMI 0.0897 0.015 5.945 0.000 0.060 0.119
DiabetesPedigreeFunction 0.9452 0.299 3.160 0.002 0.359 1.531
Age 0.0149 0.009 1.593 0.111 -0.003 0.033
============================================================================================
In [ ]:
In [ ]:
# OR
'''
A. Predict the price of the Uber ride from a given pickup point to the agreed drop-off
location. Perform following tasks:
1. Pre-process the dataset.
2. Identify outliers.
3. Check the correlation.
4. Implement linear regression and ridge, Lasso regression models.
5. Evaluate the models and compare their respective scores like R2, RMSE, etc.
Dataset link: https://www.kaggle.com/datasets/yasserh/uber-fares-datase
In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.impute import SimpleImputer
# Load the dataset
df = pd.read_csv("uber.csv")
# view dataset
df.head()
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
# print(df['pickup_datetime'])
df['hour'] = df['pickup_datetime'].dt.hour
# print(df['hour'])
df['day_of_week'] = df['pickup_datetime'].dt.dayofweek
# print(df['day_of_week'])
# check datasets for more columns we added 'hour' and 'day_of_week' column
df.head()
# Drop unnecessary columns
df = df.drop(columns=['Unnamed: 0', 'key', 'pickup_datetime'])
# check datasets for removal of columns we removed 'first_column with no name', 'key' and 'pickup_datetime' column
df.head()
# Handle missing values
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
# Split the data into features (X) and target (y)
X = df_imputed.drop(columns=['fare_amount']) # create new dataset ignoring 'fare_amount' column
y = df_imputed['fare_amount'] # create a series of only 'fare_amount' column
#Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardize the features (scaling)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Implement Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)
# Implement Ridge Regression
ridge_model = Ridge(alpha=1.0) # You can experiment with different alpha values
ridge_model.fit(X_train_scaled, y_train)
y_pred_ridge = ridge_model.predict(X_test_scaled)
# Implement Lasso Regression
lasso_model = Lasso(alpha=0.1) # You can experiment with different alpha values
lasso_model.fit(X_train_scaled, y_train)
y_pred_lasso = lasso_model.predict(X_test_scaled)
# Evaluate the models
def evaluate_model(y_true, y_pred, model_name):
r2 = r2_score(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
print(f"{model_name} - R2 Score: {r2:.4f}, RMSE: {rmse:.2f}")
evaluate_model(y_test, y_pred_lr, "Linear Regression")
evaluate_model(y_test, y_pred_ridge, "Ridge Regression")
evaluate_model(y_test, y_pred_lasso, "Lasso Regression")
Linear Regression - R2 Score: 0.0007, RMSE: 10.31 Ridge Regression - R2 Score: 0.0007, RMSE: 10.31 Lasso Regression - R2 Score: 0.0003, RMSE: 10.31
In [ ]:
In [ ]:
#OR
In [11]:
# Load the required packages
import numpy as np
import pandas as pd
from pandas import read_csv
# Specify the file name
filename = 'diabetes.csv'
# Read the data
data = read_csv(filename)
# Print the shape
data.shape
# Print the first few rows
data.head()
# Show the type of 'data'
type(data)
# Get the column names
col_idx = data.columns
col_idx
# Get row indices
row_idx = data.index
print(row_idx)
# Find data type for each attribute
print("Data type of each attribute:")
data.dtypes
# Generate statistical summary
description = data.describe()
print("Statistical summary of the data:\n")
description
#Therefore, there are a total of 768 entries in the dataset. The outcome variable is set to 1 for 268 entries, and the rest are set to 0.
class_counts = data.groupby('Outcome').size()
print("Class breakdown of the data:\n")
print(class_counts)
# Compute correlation matrix
correlations = data.corr(method = 'pearson')
print("Correlations of attributes in the data:\n")
correlations
skew = data.skew()
print("Skew of attribute distributions in the data:\n")
skew
#histogram
# Import required package
from matplotlib import pyplot
pyplot.rcParams['figure.figsize'] = [20, 10]; # set the figure size
# Draw histograms for all attributes
data.hist()
pyplot.show()
# Density plots for all attributes
data.plot(kind='density', subplots=True, layout=(3,3), sharex=False)
pyplot.show()
#box plot
# Draw box and whisker plots for all attributes
data.plot(kind= 'box', subplots=True, layout=(3,3), sharex=False, sharey=False)
pyplot.show()
#multivarate plots
# Compute the correlation matrix
correlations = data.corr(method = 'pearson') # Correlations between all pairs of attributes
# Print the datatype
type(correlations)
# Show the correlation matrix
correlations
# import required package
import numpy as np
# plot correlation matrix
fig = pyplot.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
ticks = np.arange(0,9,1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
names = data.columns
ax.set_xticklabels(names,rotation=90) # Rotate x-tick labels by 90 degrees
ax.set_yticklabels(names)
pyplot.show()
# Import required package
from pandas.plotting import scatter_matrix
pyplot.rcParams['figure.figsize'] = [20, 20]
# Plotting Scatterplot Matrix
scatter_matrix(data)
pyplot.show()
RangeIndex(start=0, stop=768, step=1) Data type of each attribute: Statistical summary of the data: Class breakdown of the data: Outcome 0 500 1 268 dtype: int64 Correlations of attributes in the data: Skew of attribute distributions in the data:
In [ ]:
In [ ]:
''' Pract 3 Classification Analysis (Any one)
A. Implementation of Support Vector Machines (SVM) for classifying images of handwritten digits into their respective numerical classes (0 to 9)
In [4]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
# Load the digits dataset
digits = datasets.load_digits()
# Split the data into features (X) and labels (y)
X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create an SVM classifier (linear kernel)
clf = svm.SVC(kernel='linear')
# Fit the classifier on the training data
clf.fit(X_train, y_train)
#predict the test data
y_pred = clf.predict(X_test)
# Calculate accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy : ", accuracy)
# Confusion matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print("Confusion Matrix : ")
print(confusion_matrix)
# Classification report
classification_report = metrics.classification_report(y_test, y_pred)
print("Classification Report : ")
print(classification_report)
# Visualize some of the test images and their predicted labels
plt.figure(figsize=(15, 8))
for i in range(10):
plt.subplot(5, 5, i + 1)
plt.imshow(X_test[i].reshape(8, 8), cmap=plt.cm.gray_r)
plt.title(f"Predicted : {y_pred[i]}, Actual : {y_test[i]}")
plt.axis('on')
Accuracy : 0.9777777777777777
Confusion Matrix :
[[33 0 0 0 0 0 0 0 0 0]
[ 0 28 0 0 0 0 0 0 0 0]
[ 0 0 33 0 0 0 0 0 0 0]
[ 0 0 0 32 0 1 0 0 0 1]
[ 0 1 0 0 45 0 0 0 0 0]
[ 0 0 0 0 0 47 0 0 0 0]
[ 0 0 0 0 0 0 35 0 0 0]
[ 0 0 0 0 0 0 0 33 0 1]
[ 0 0 0 0 0 1 0 0 29 0]
[ 0 0 0 1 1 0 0 1 0 37]]
Classification Report :
precision recall f1-score support
0 1.00 1.00 1.00 33
1 0.97 1.00 0.98 28
2 1.00 1.00 1.00 33
3 0.97 0.94 0.96 34
4 0.98 0.98 0.98 46
5 0.96 1.00 0.98 47
6 1.00 1.00 1.00 35
7 0.97 0.97 0.97 34
8 1.00 0.97 0.98 30
9 0.95 0.93 0.94 40
accuracy 0.98 360
macro avg 0.98 0.98 0.98 360
weighted avg 0.98 0.98 0.98 360
In [ ]:
In [ ]:
#OR
In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
%matplotlib inline
# Load the digits dataset
digits = datasets.load_digits()
# Print dataset information
print("Digits dataset keys:", digits.keys())
print("Image data shape:", digits.images.shape)
print("Labels shape:", digits.target.shape)
# Display some sample images with their labels
fig, axes = plt.subplots(1, 5, figsize=(10, 3))
for ax, image, label in zip(axes, digits.images, digits.target):
ax.set_axis_off()
ax.imshow(image, cmap='gray')
ax.set_title(f"Label: {label}")
plt.show()
# Flatten the images for the SVM model
X = digits.images.reshape((len(digits.images), -1))
y = digits.target
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Initialize the SVM model with a linear kernel
svm_model = SVC(kernel='linear', C=1.0)
# Train the model
svm_model.fit(X_train, y_train)
# Predict on the test set
y_pred = svm_model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
# Display the classification report
print("Classification Report:\n", classification_report(y_test, y_pred))
# Display the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)
# Visualize the confusion matrix
plt.figure(figsize=(10, 6))
sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='d')
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()
Digits dataset keys: dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR']) Image data shape: (1797, 8, 8) Labels shape: (1797,)
Model Accuracy: 97.78%
Classification Report:
precision recall f1-score support
0 0.98 1.00 0.99 53
1 0.98 0.98 0.98 50
2 0.98 1.00 0.99 47
3 0.96 0.96 0.96 54
4 1.00 1.00 1.00 60
5 0.97 0.95 0.96 66
6 0.98 0.98 0.98 53
7 1.00 0.98 0.99 55
8 0.95 0.95 0.95 43
9 0.97 0.97 0.97 59
accuracy 0.98 540
macro avg 0.98 0.98 0.98 540
weighted avg 0.98 0.98 0.98 540
Confusion Matrix:
[[53 0 0 0 0 0 0 0 0 0]
[ 0 49 0 0 0 0 0 0 1 0]
[ 0 0 47 0 0 0 0 0 0 0]
[ 0 0 1 52 0 1 0 0 0 0]
[ 0 0 0 0 60 0 0 0 0 0]
[ 0 0 0 1 0 63 1 0 0 1]
[ 1 0 0 0 0 0 52 0 0 0]
[ 0 0 0 0 0 0 0 54 0 1]
[ 0 1 0 0 0 1 0 0 41 0]
[ 0 0 0 1 0 0 0 0 1 57]]
In [ ]:
In [ ]:
'''Pract 3 B B. Implement K-Nearest Neighbours’ algorithm on Social network ad dataset. Compute
confusion matrix, accuracy, error rate, precision and recall on the given dataset.
In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
import seaborn as sns
%matplotlib inline
# Load the dataset
df = pd.read_csv('Social_Network_Ads.csv')
# Display the first few rows of the dataset
print(df.head())
# Drop unnecessary columns
df = df.drop(['User ID', 'Gender'], axis=1)
# Define features and target variable
X = df[['Age', 'EstimatedSalary']].values
y = df['Purchased'].values
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Initialize the KNN classifier with k=5 (you can tune this value)
knn = KNeighborsClassifier(n_neighbors=5)
# Train the model
knn.fit(X_train, y_train)
# Predict on the test set
y_pred = knn.predict(X_test)
# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
# Visualize the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(f"Error Rate: {error_rate:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
User ID Gender Age EstimatedSalary Purchased 0 15624510 Male 19 19000 0 1 15810944 Male 35 20000 0 2 15668575 Female 26 43000 0 3 15603246 Female 27 57000 0 4 15804002 Male 19 76000 0
Accuracy: 0.93 Error Rate: 0.07 Precision: 0.91 Recall: 0.91
In [ ]:
In [ ]:
#OR
In [8]:
# K-Nearest Neighbors (k-NN)
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Fitting the knn classifier to the Training set
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knn.fit(X_train, y_train)
# Predicting the Test set results
y_pred = knn.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, knn.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha = 0.5, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], alpha=0.5,
c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('K-Nearest Neighbors (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, knn.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha = 0.5, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], alpha=0.9,
c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('K-Nearest Neighbors (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
# As you can see in the gifure above, the incorrect predictions using K-NN model is 7 when compared to 11 incorrect predictions
#using Logistic regression. This concludes that though K-NN is not the best model for this prediction
#it is a better model compared to logistic regression. Cheers!
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_1368\1339922156.py:52: UserWarning: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points. plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], alpha=0.5,
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_1368\1339922156.py:71: UserWarning: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points. plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], alpha=0.9,
In [ ]:
In [ ]:
''' Pract 4 - A - Clustering Analysis (Any one)
A. Implement K-Means clustering on Iris.csv dataset. Determine the number of clusters
using the elbow method
In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
%matplotlib inline
# Load the dataset
df = pd.read_csv('Iris.csv')
# Display the first few rows of the dataset
print(df.head())
# Drop the 'Species' column as it is not needed for clustering
X = df.drop(['Species', 'Id'], axis=1)
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Calculate WCSS for different number of clusters
wcss = []
for i in range(1, 11):
kmeans = KMeans(n_clusters=i, random_state=42)
kmeans.fit(X_scaled)
wcss.append(kmeans.inertia_)
# Plot the elbow curve
plt.figure(figsize=(10, 6))
plt.plot(range(1, 11), wcss, marker='o', linestyle='-', color='b')
plt.title('Elbow Method to Determine Optimal Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Within-Cluster-Sum-of-Squares (WCSS)')
plt.xticks(range(1, 11))
plt.grid(True)
plt.show()
# Apply K-Means with the chosen number of clusters
optimal_clusters = 3
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
y_kmeans = kmeans.fit_predict(X_scaled)
# Add the cluster labels to the dataset
df['Cluster'] = y_kmeans
print(df.head())
# Plot the clusters
plt.figure(figsize=(10, 6))
plt.scatter(X_scaled[y_kmeans == 0, 2], X_scaled[y_kmeans == 0, 3], s=100, c='red', label='Cluster 1')
plt.scatter(X_scaled[y_kmeans == 1, 2], X_scaled[y_kmeans == 1, 3], s=100, c='blue', label='Cluster 2')
plt.scatter(X_scaled[y_kmeans == 2, 2], X_scaled[y_kmeans == 2, 3], s=100, c='green', label='Cluster 3')
# Plot centroids
plt.scatter(kmeans.cluster_centers_[:, 2], kmeans.cluster_centers_[:, 3], s=300, c='yellow', marker='X', label='Centroids')
plt.title('Clusters of Iris Dataset')
plt.xlabel('Petal Length (Standardized)')
plt.ylabel('Petal Width (Standardized)')
plt.legend()
plt.grid(True)
plt.show()
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species 0 1 5.1 3.5 1.4 0.2 Iris-setosa 1 2 4.9 3.0 1.4 0.2 Iris-setosa 2 3 4.7 3.2 1.3 0.2 Iris-setosa 3 4 4.6 3.1 1.5 0.2 Iris-setosa 4 5 5.0 3.6 1.4 0.2 Iris-setosa
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species \ 0 1 5.1 3.5 1.4 0.2 Iris-setosa 1 2 4.9 3.0 1.4 0.2 Iris-setosa 2 3 4.7 3.2 1.3 0.2 Iris-setosa 3 4 4.6 3.1 1.5 0.2 Iris-setosa 4 5 5.0 3.6 1.4 0.2 Iris-setosa Cluster 0 1 1 2 2 2 3 2 4 1
In [ ]:
In [ ]:
#OR
In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Load the Iris dataset
df = pd.read_csv("Iris.csv")
# Select features (attributes) for clustering (e.g., sepal_length, sepal_width, petal_length, petal_width)
X = df.iloc[:, 1:-1] # Exclude the first column (id) and the last column (species)
# Standardize the feature matrix (important for K-Means)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Determine the optimal number of clusters using the elbow method
inertia = []
for k in range(1, 11):
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X_scaled)
inertia.append(kmeans.inertia_)
print(inertia)
# Plot the elbow curve
plt.figure(figsize=(8, 6))
plt.plot(range(1, 11), inertia, marker='o', linestyle='--')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia (Within-cluster Sum of Squares)')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.grid()
plt.show()
[600.0, 223.73200573676345, 192.03717409190028, 114.68221609937967, 91.29544474066981, 81.76026132860622, 80.98238131032987, 68.08623905064636, 59.38528882045366, 52.9899972101586]
In [ ]:
In [ ]:
#OR
In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
# Load the iris dataset
iris_df = sns.load_dataset("iris")
# Display value counts for species column
print(iris_df["species"].value_counts())
# Show the first few rows
print(iris_df.head())
# Heatmap of correlations
plt.figure(figsize=(13, 10))
correlation = iris_df.select_dtypes(include=['float64']).corr() # Only select numeric columns
sns.heatmap(correlation, annot=True, fmt='.2f', cmap="RdPu")
plt.title("Correlation Heatmap")
plt.show()
# Scatter plot of Sepal Length vs Sepal Width
plt.figure(figsize=(10, 7))
sns.set(style='whitegrid')
sns.scatterplot(x="sepal_length", y="sepal_width", hue='species', data=iris_df, palette="inferno")
plt.title("Scatter Plot of Sepal Length vs Sepal Width")
plt.show()
# Joint plot for Sepal Width vs Petal Width
sns.jointplot(x='petal_width', y='sepal_width', hue="species", palette="inferno", data=iris_df)
plt.show()
# Pair plot for all features
sns.pairplot(iris_df, hue="species", palette='gnuplot2')
plt.show()
# Prepare data for K-Means (use only numeric columns for clustering)
X = iris_df[['sepal_length', 'petal_width']].copy()
# Determine the optimal number of clusters using the Elbow Method
distances = []
for i in range(1, 7):
kmeans = KMeans(n_clusters=i, random_state=42)
kmeans.fit(X)
distances.append(kmeans.inertia_)
# Plotting the Elbow Curve
plt.figure(figsize=(8, 6))
plt.plot(range(1, 7), distances, marker='o')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()
# Apply K-Means with the optimal number of clusters (e.g., 3)
kmeans = KMeans(n_clusters=3, random_state=42)
identified_clusters = kmeans.fit_predict(X)
# Adding the cluster data to the original dataset for plotting
data_with_clusters = X.copy()
data_with_clusters['Cluster'] = identified_clusters
# Scatter plot with clusters
plt.figure(figsize=(10, 10))
scatter = plt.scatter(data_with_clusters['sepal_length'],
data_with_clusters['petal_width'],
c=data_with_clusters['Cluster'], cmap='winter')
plt.title('Scatter Plot for Sepal Length and Petal Width with Clusters')
plt.xlabel('Sepal Length')
plt.ylabel('Petal Width')
plt.legend(*scatter.legend_elements(), title='Cluster')
plt.show()
# Inference: From the figure, we can say that the optimal number of clusters is where the elbow occurs (K=3).
# With K=3, we can segment the data into 3 different clusters with minimal error.
species setosa 50 versicolor 50 virginica 50 Name: count, dtype: int64 sepal_length sepal_width petal_length petal_width species 0 5.1 3.5 1.4 0.2 setosa 1 4.9 3.0 1.4 0.2 setosa 2 4.7 3.2 1.3 0.2 setosa 3 4.6 3.1 1.5 0.2 setosa 4 5.0 3.6 1.4 0.2 setosa
In [13]:
# Same as above but with solved error
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn import preprocessing
# Load Iris dataset
iris_df = sns.load_dataset("iris")
# Encode the species labels as numeric values
label_encoder = preprocessing.LabelEncoder()
iris_df['species'] = label_encoder.fit_transform(iris_df['species'])
# Heatmap of correlations
plt.figure(figsize=(13, 10))
correlation = iris_df.corr()
sns.heatmap(correlation, annot=True, fmt='.2f', cmap="RdPu")
plt.title("Correlation Heatmap")
plt.show()
# Scatter plot of Sepal Length vs Sepal Width
plt.figure(figsize=(10, 7))
sns.set(style='whitegrid')
sns.scatterplot(x="sepal_length", y="sepal_width", hue='species', data=iris_df, palette="inferno")
plt.title("Scatter Plot of Sepal Length vs Sepal Width")
plt.show()
# Joint plot for Sepal Width vs Petal Width
sns.jointplot(x='petal_width', y='sepal_width', hue="species", palette="inferno", data=iris_df)
plt.show()
# Pair plot for all features
sns.pairplot(iris_df, hue="species", palette='gnuplot2')
plt.show()
# Prepare data for K-Means
X = iris_df[['sepal_length', 'petal_width']]
# Determine the optimal number of clusters using the Elbow Method
distance = []
for i in range(1, 7):
kmeans = KMeans(n_clusters=i, random_state=42)
kmeans.fit(X)
distance.append(kmeans.inertia_)
# Plotting the Elbow Curve
plt.figure(figsize=(8, 6))
plt.plot(range(1, 7), distance, marker='o')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()
# Apply K-Means with the optimal number of clusters (e.g., 3)
kmeans = KMeans(n_clusters=3, random_state=42)
identified_clusters = kmeans.fit_predict(X)
# Adding the cluster data to the original dataset for plotting
data_with_clusters = X.copy()
data_with_clusters['Cluster'] = identified_clusters
# Scatter plot with clusters
plt.figure(figsize=(10, 10))
scatter = plt.scatter(data_with_clusters['sepal_length'],
data_with_clusters['petal_width'],
c=data_with_clusters['Cluster'], cmap='winter')
plt.title('Scatter Plot for Sepal Length and Petal Width with Clusters')
plt.xlabel('Sepal Length')
plt.ylabel('Petal Width')
plt.legend(*scatter.legend_elements(), title='Cluster')
plt.show()
In [ ]:
In [ ]:
'''Pract 4 - B - B. Implement K-Mediod Algorithm on a credit card dataset. Determine the number of
clusters using the Silhouette Method.
In [ ]:
import pandas as pd
import numpy as np
import random
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Load your dataset
data = pd.read_csv('creditcard.csv') # Update with your actual file path
print(data.head())
# Handle missing values: Drop rows with missing values
data_cleaned = data.dropna()
# Select only numeric features for clustering
numeric_data = data_cleaned.select_dtypes(include=[np.number])
# Standardize the numeric features
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_data)
# K-Medoids Algorithm
def k_medoids(X, num_clusters, max_iterations=100):
m, n = X.shape
medoids_indices = random.sample(range(m), num_clusters)
medoids = X[medoids_indices]
for _ in range(max_iterations):
clusters = np.zeros(m)
# Assign clusters based on the nearest medoid
for i in range(m):
distances = np.linalg.norm(X[i] - medoids, axis=1)
clusters[i] = np.argmin(distances)
# Update medoids
new_medoids = np.zeros((num_clusters, n))
for i in range(num_clusters):
cluster_points = X[clusters == i]
if len(cluster_points) > 0:
distances_in_cluster = np.sum(np.linalg.norm(cluster_points[:, np.newaxis] - cluster_points, axis=2), axis=1)
new_medoids[i] = cluster_points[np.argmin(distances_in_cluster)]
else:
new_medoids[i] = medoids[i] # Keep the old medoid if the cluster is empty
medoids = new_medoids
return clusters, medoids
# Determine the optimal number of clusters using the Silhouette Method
silhouette_scores = []
range_n_clusters = range(2, 10) # Try different cluster sizes
optimal_n_clusters = 0
best_silhouette_score = -1
for n_clusters in range_n_clusters:
clusters, medoids = k_medoids(scaled_data, n_clusters)
silhouette_avg = silhouette_score(scaled_data, clusters)
silhouette_scores.append(silhouette_avg)
print(f"For n_clusters = {n_clusters}, the silhouette score is: {silhouette_avg}")
# Track the optimal number of clusters
if silhouette_avg > best_silhouette_score:
best_silhouette_score = silhouette_avg
optimal_n_clusters = n_clusters
print(f"\nOptimal number of clusters based on silhouette score is: {optimal_n_clusters}")
# Plotting silhouette scores to visualize the optimal number of clusters
plt.figure(figsize=(10, 6))
plt.plot(range_n_clusters, silhouette_scores, marker='o')
plt.title('Silhouette Scores for K-Medoids')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.xticks(range_n_clusters)
plt.grid()
plt.show()
CUST_ID BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES \
0 C10001 40.900749 0.818182 95.40 0.00
1 C10002 3202.467416 0.909091 0.00 0.00
2 C10003 2495.148862 1.000000 773.17 773.17
3 C10004 1666.670542 0.636364 1499.00 1499.00
4 C10005 817.714335 1.000000 16.00 16.00
INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY \
0 95.4 0.000000 0.166667
1 0.0 6442.945483 0.000000
2 0.0 0.000000 1.000000
3 0.0 205.788017 0.083333
4 0.0 0.000000 0.083333
ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY \
0 0.000000 0.083333
1 0.000000 0.000000
2 1.000000 0.000000
3 0.083333 0.000000
4 0.083333 0.000000
CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT \
0 0.000000 0 2 1000.0
1 0.250000 4 0 7000.0
2 0.000000 0 12 7500.0
3 0.083333 1 1 7500.0
4 0.000000 0 1 1200.0
PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE
0 201.802084 139.509787 0.000000 12
1 4103.032597 1072.340217 0.222222 12
2 622.066742 627.284787 0.000000 12
3 0.000000 NaN 0.000000 12
4 678.334763 244.791237 0.000000 12
In [1]:
!pip install pyclustering
Requirement already satisfied: pyclustering in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (0.10.1.2) Requirement already satisfied: scipy>=1.1.0 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pyclustering) (1.11.4) Requirement already satisfied: matplotlib>=3.0.0 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pyclustering) (3.8.2) Requirement already satisfied: numpy>=1.15.2 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pyclustering) (1.26.3) Requirement already satisfied: Pillow>=5.2.0 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pyclustering) (10.2.0) Requirement already satisfied: contourpy>=1.0.1 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (1.2.0) Requirement already satisfied: cycler>=0.10 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (0.12.1) Requirement already satisfied: fonttools>=4.22.0 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (4.47.2) Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (1.4.5) Requirement already satisfied: packaging>=20.0 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (23.2) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (3.1.1) Requirement already satisfied: python-dateutil>=2.7 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (2.8.2) Requirement already satisfied: six>=1.5 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from python-dateutil>=2.7->matplotlib>=3.0.0->pyclustering) (1.16.0)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
'''Pract 5 5 Ensemble Learning (Any one)
A. Implement Random Forest Classifier model to predict the safety of the car.
In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
# Load the dataset (replace with your actual file path)
data = pd.read_csv('car_safety.csv') # Update with your actual file path
# Print the first few rows of the dataset to understand its structure
print(data.head())
# Check the unique values in the target variable
print(data['unacc'].unique())
# Preprocess the data: handle missing values and encode categorical variables
data = data.dropna() # Drop rows with missing values, or you can use imputation
# Example: Encoding categorical variables
for column in data.select_dtypes(include=['object']).columns:
le = LabelEncoder()
data[column] = le.fit_transform(data[column])
# Define features and target variable
X = data.drop('unacc', axis=1) # Features
y = data['unacc'] # Target variable
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model
rf_classifier.fit(X_train, y_train)
# Make predictions
y_pred = rf_classifier.predict(X_test)
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
vhigh vhigh.1 2 2.1 small low unacc
0 vhigh vhigh 2 2 small med unacc
1 vhigh vhigh 2 2 small high unacc
2 vhigh vhigh 2 2 med low unacc
3 vhigh vhigh 2 2 med med unacc
4 vhigh vhigh 2 2 med high unacc
['unacc' 'acc' 'vgood' 'good']
Confusion Matrix:
[[108 1 7 2]
[ 1 12 0 4]
[ 1 0 360 0]
[ 2 0 0 21]]
Classification Report:
precision recall f1-score support
0 0.96 0.92 0.94 118
1 0.92 0.71 0.80 17
2 0.98 1.00 0.99 361
3 0.78 0.91 0.84 23
accuracy 0.97 519
macro avg 0.91 0.88 0.89 519
weighted avg 0.97 0.97 0.96 519
In [ ]:
In [ ]:
#OR
In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Load the car evaluation dataset
data = pd.read_csv("car_safety.csv")
# Encoding all the string data
data = data.apply(LabelEncoder().fit_transform)
# Define the features (X) and the target variable (y)
X = data.iloc[:, :-1] # Features (all columns except the last one)
y = data.iloc[:, -1] # Target variable (last column)
# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)
# Make predictions on the test data
y_pred = rf_classifier.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("\nConfusion Matrix:\n", confusion)
print("\nClassification Report:\n", classification_rep)
Accuracy: 0.9624277456647399
Confusion Matrix:
[[ 72 1 3 1]
[ 2 10 0 3]
[ 1 0 236 0]
[ 2 0 0 15]]
Classification Report:
precision recall f1-score support
0 0.94 0.94 0.94 77
1 0.91 0.67 0.77 15
2 0.99 1.00 0.99 237
3 0.79 0.88 0.83 17
accuracy 0.96 346
macro avg 0.91 0.87 0.88 346
weighted avg 0.96 0.96 0.96 346
In [9]:
!pip install xgboost
Collecting xgboost
Downloading xgboost-2.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Requirement already satisfied: numpy in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from xgboost) (1.26.3)
Requirement already satisfied: scipy in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from xgboost) (1.11.4)
Downloading xgboost-2.1.2-py3-none-win_amd64.whl (124.9 MB)
---------------------------------------- 0.0/124.9 MB ? eta -:--:--
--------------------------------------- 1.8/124.9 MB 10.1 MB/s eta 0:00:13
- -------------------------------------- 3.1/124.9 MB 8.0 MB/s eta 0:00:16
- -------------------------------------- 4.2/124.9 MB 7.2 MB/s eta 0:00:17
- -------------------------------------- 5.2/124.9 MB 6.8 MB/s eta 0:00:18
-- ------------------------------------- 6.3/124.9 MB 6.1 MB/s eta 0:00:20
-- ------------------------------------- 7.3/124.9 MB 5.9 MB/s eta 0:00:20
-- ------------------------------------- 8.7/124.9 MB 6.0 MB/s eta 0:00:20
-- ------------------------------------- 9.2/124.9 MB 5.8 MB/s eta 0:00:20
--- ------------------------------------ 10.2/124.9 MB 5.5 MB/s eta 0:00:21
--- ------------------------------------ 11.3/124.9 MB 5.5 MB/s eta 0:00:21
---- ----------------------------------- 12.6/124.9 MB 5.6 MB/s eta 0:00:21
---- ----------------------------------- 14.2/124.9 MB 5.7 MB/s eta 0:00:20
----- ---------------------------------- 15.7/124.9 MB 5.9 MB/s eta 0:00:19
----- ---------------------------------- 16.5/124.9 MB 5.7 MB/s eta 0:00:19
----- ---------------------------------- 17.3/124.9 MB 5.6 MB/s eta 0:00:20
----- ---------------------------------- 18.1/124.9 MB 5.6 MB/s eta 0:00:20
------ --------------------------------- 19.4/124.9 MB 5.6 MB/s eta 0:00:19
------ --------------------------------- 21.2/124.9 MB 5.7 MB/s eta 0:00:19
------- -------------------------------- 22.8/124.9 MB 5.8 MB/s eta 0:00:18
------- -------------------------------- 23.9/124.9 MB 5.9 MB/s eta 0:00:18
------- -------------------------------- 24.6/124.9 MB 5.7 MB/s eta 0:00:18
-------- ------------------------------- 25.4/124.9 MB 5.7 MB/s eta 0:00:18
-------- ------------------------------- 26.7/124.9 MB 5.6 MB/s eta 0:00:18
-------- ------------------------------- 28.0/124.9 MB 5.7 MB/s eta 0:00:18
--------- ------------------------------ 29.9/124.9 MB 5.8 MB/s eta 0:00:17
--------- ------------------------------ 31.2/124.9 MB 5.8 MB/s eta 0:00:17
---------- ----------------------------- 31.5/124.9 MB 5.7 MB/s eta 0:00:17
---------- ----------------------------- 31.5/124.9 MB 5.7 MB/s eta 0:00:17
---------- ----------------------------- 31.5/124.9 MB 5.7 MB/s eta 0:00:17
---------- ----------------------------- 31.5/124.9 MB 5.7 MB/s eta 0:00:17
---------- ----------------------------- 31.5/124.9 MB 5.7 MB/s eta 0:00:17
---------- ----------------------------- 31.7/124.9 MB 4.8 MB/s eta 0:00:20
---------- ----------------------------- 32.2/124.9 MB 4.7 MB/s eta 0:00:20
---------- ----------------------------- 33.8/124.9 MB 4.8 MB/s eta 0:00:20
----------- ---------------------------- 35.9/124.9 MB 4.9 MB/s eta 0:00:19
----------- ---------------------------- 36.4/124.9 MB 4.9 MB/s eta 0:00:19
----------- ---------------------------- 37.2/124.9 MB 4.9 MB/s eta 0:00:19
------------ --------------------------- 38.3/124.9 MB 4.9 MB/s eta 0:00:18
------------ --------------------------- 39.6/124.9 MB 4.9 MB/s eta 0:00:18
------------- -------------------------- 41.2/124.9 MB 4.9 MB/s eta 0:00:17
------------- -------------------------- 42.7/124.9 MB 5.0 MB/s eta 0:00:17
-------------- ------------------------- 44.0/124.9 MB 5.0 MB/s eta 0:00:17
-------------- ------------------------- 45.4/124.9 MB 5.1 MB/s eta 0:00:16
-------------- ------------------------- 46.4/124.9 MB 5.1 MB/s eta 0:00:16
--------------- ------------------------ 47.2/124.9 MB 5.0 MB/s eta 0:00:16
--------------- ------------------------ 48.5/124.9 MB 5.1 MB/s eta 0:00:16
---------------- ----------------------- 50.1/124.9 MB 5.1 MB/s eta 0:00:15
---------------- ----------------------- 51.6/124.9 MB 5.2 MB/s eta 0:00:15
---------------- ----------------------- 53.0/124.9 MB 5.2 MB/s eta 0:00:14
----------------- ---------------------- 54.0/124.9 MB 5.2 MB/s eta 0:00:14
----------------- ---------------------- 55.1/124.9 MB 5.2 MB/s eta 0:00:14
----------------- ---------------------- 56.1/124.9 MB 5.2 MB/s eta 0:00:14
------------------ --------------------- 57.4/124.9 MB 5.2 MB/s eta 0:00:13
------------------ --------------------- 58.7/124.9 MB 5.2 MB/s eta 0:00:13
------------------- -------------------- 59.8/124.9 MB 5.2 MB/s eta 0:00:13
------------------- -------------------- 61.1/124.9 MB 5.2 MB/s eta 0:00:13
-------------------- ------------------- 62.7/124.9 MB 5.3 MB/s eta 0:00:12
-------------------- ------------------- 64.0/124.9 MB 5.3 MB/s eta 0:00:12
-------------------- ------------------- 65.0/124.9 MB 5.3 MB/s eta 0:00:12
--------------------- ------------------ 66.3/124.9 MB 5.3 MB/s eta 0:00:12
--------------------- ------------------ 67.4/124.9 MB 5.3 MB/s eta 0:00:11
--------------------- ------------------ 68.7/124.9 MB 5.3 MB/s eta 0:00:11
---------------------- ----------------- 70.5/124.9 MB 5.4 MB/s eta 0:00:11
---------------------- ----------------- 71.3/124.9 MB 5.4 MB/s eta 0:00:10
----------------------- ---------------- 71.8/124.9 MB 5.3 MB/s eta 0:00:10
----------------------- ---------------- 72.9/124.9 MB 5.3 MB/s eta 0:00:10
----------------------- ---------------- 73.9/124.9 MB 5.3 MB/s eta 0:00:10
------------------------ --------------- 75.2/124.9 MB 5.3 MB/s eta 0:00:10
------------------------ --------------- 76.8/124.9 MB 5.3 MB/s eta 0:00:10
------------------------- -------------- 78.6/124.9 MB 5.4 MB/s eta 0:00:09
------------------------- -------------- 79.7/124.9 MB 5.4 MB/s eta 0:00:09
------------------------- -------------- 80.5/124.9 MB 5.4 MB/s eta 0:00:09
-------------------------- ------------- 81.5/124.9 MB 5.4 MB/s eta 0:00:09
-------------------------- ------------- 82.6/124.9 MB 5.4 MB/s eta 0:00:08
-------------------------- ------------- 84.1/124.9 MB 5.4 MB/s eta 0:00:08
--------------------------- ------------ 86.0/124.9 MB 5.4 MB/s eta 0:00:08
--------------------------- ------------ 87.3/124.9 MB 5.4 MB/s eta 0:00:07
---------------------------- ----------- 87.6/124.9 MB 5.4 MB/s eta 0:00:07
---------------------------- ----------- 88.3/124.9 MB 5.4 MB/s eta 0:00:07
---------------------------- ----------- 89.1/124.9 MB 5.4 MB/s eta 0:00:07
---------------------------- ----------- 90.4/124.9 MB 5.4 MB/s eta 0:00:07
----------------------------- ---------- 91.8/124.9 MB 5.4 MB/s eta 0:00:07
----------------------------- ---------- 93.6/124.9 MB 5.4 MB/s eta 0:00:06
------------------------------ --------- 95.2/124.9 MB 5.4 MB/s eta 0:00:06
------------------------------ --------- 95.9/124.9 MB 5.4 MB/s eta 0:00:06
------------------------------- -------- 97.0/124.9 MB 5.4 MB/s eta 0:00:06
------------------------------- -------- 98.0/124.9 MB 5.4 MB/s eta 0:00:05
------------------------------- -------- 99.4/124.9 MB 5.4 MB/s eta 0:00:05
-------------------------------- ------- 101.2/124.9 MB 5.4 MB/s eta 0:00:05
-------------------------------- ------- 102.5/124.9 MB 5.5 MB/s eta 0:00:05
--------------------------------- ------ 103.5/124.9 MB 5.5 MB/s eta 0:00:04
--------------------------------- ------ 104.6/124.9 MB 5.4 MB/s eta 0:00:04
--------------------------------- ------ 105.9/124.9 MB 5.5 MB/s eta 0:00:04
---------------------------------- ----- 107.2/124.9 MB 5.5 MB/s eta 0:00:04
---------------------------------- ----- 108.8/124.9 MB 5.5 MB/s eta 0:00:03
----------------------------------- ---- 109.6/124.9 MB 5.5 MB/s eta 0:00:03
----------------------------------- ---- 110.6/124.9 MB 5.5 MB/s eta 0:00:03
----------------------------------- ---- 111.7/124.9 MB 5.5 MB/s eta 0:00:03
------------------------------------ --- 113.2/124.9 MB 5.5 MB/s eta 0:00:03
------------------------------------ --- 115.1/124.9 MB 5.5 MB/s eta 0:00:02
------------------------------------- -- 115.9/124.9 MB 5.5 MB/s eta 0:00:02
------------------------------------- -- 116.9/124.9 MB 5.5 MB/s eta 0:00:02
------------------------------------- -- 118.0/124.9 MB 5.5 MB/s eta 0:00:02
-------------------------------------- - 119.3/124.9 MB 5.5 MB/s eta 0:00:02
-------------------------------------- - 121.1/124.9 MB 5.5 MB/s eta 0:00:01
--------------------------------------- 121.9/124.9 MB 5.5 MB/s eta 0:00:01
--------------------------------------- 123.2/124.9 MB 5.5 MB/s eta 0:00:01
--------------------------------------- 124.5/124.9 MB 5.5 MB/s eta 0:00:01
--------------------------------------- 124.8/124.9 MB 5.5 MB/s eta 0:00:01
---------------------------------------- 124.9/124.9 MB 5.5 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.1.2
In [ ]:
In [ ]:
In [ ]:
'''pract 5 - B B. Use different voting mechanism and Apply AdaBoost (Adaptive Boosting), Gradient
Tree Boosting (GBM), XGBoost classification on Iris dataset and compare the
performance of three models using different evaluation measures.
Dataset Link: https://www.kaggle.com/datasets/uciml/iris
In [10]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Initialize classifiers
ada_classifier = AdaBoostClassifier(n_estimators=100, random_state=42)
gbm_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)
xgb_classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
# Train the classifiers
ada_classifier.fit(X_train, y_train)
gbm_classifier.fit(X_train, y_train)
xgb_classifier.fit(X_train, y_train)
# Make predictions
ada_pred = ada_classifier.predict(X_test)
gbm_pred = gbm_classifier.predict(X_test)
xgb_pred = xgb_classifier.predict(X_test)
# Evaluate the models
def evaluate_model(predictions, model_name):
print(f"Evaluation for {model_name}:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))
print("\nClassification Report:")
print(classification_report(y_test, predictions))
print(f"Accuracy: {accuracy_score(y_test, predictions)}\n")
# Display evaluation for each model
evaluate_model(ada_pred, "AdaBoost")
evaluate_model(gbm_pred, "Gradient Boosting")
evaluate_model(xgb_pred, "XGBoost")
C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\ensemble\_weight_boosting.py:519: FutureWarning: The SAMME.R algorithm (the default) is deprecated and will be removed in 1.6. Use the SAMME algorithm to circumvent this warning.
warnings.warn(
C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\core.py:158: UserWarning: [23:54:35] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0ed59c031377d09b8-1\xgboost\xgboost-ci-windows\src\learner.cc:740:
Parameters: { "use_label_encoder" } are not used.
warnings.warn(smsg, UserWarning)
Evaluation for AdaBoost:
Confusion Matrix:
[[19 0 0]
[ 0 13 0]
[ 0 0 13]]
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 19
1 1.00 1.00 1.00 13
2 1.00 1.00 1.00 13
accuracy 1.00 45
macro avg 1.00 1.00 1.00 45
weighted avg 1.00 1.00 1.00 45
Accuracy: 1.0
Evaluation for Gradient Boosting:
Confusion Matrix:
[[19 0 0]
[ 0 13 0]
[ 0 0 13]]
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 19
1 1.00 1.00 1.00 13
2 1.00 1.00 1.00 13
accuracy 1.00 45
macro avg 1.00 1.00 1.00 45
weighted avg 1.00 1.00 1.00 45
Accuracy: 1.0
Evaluation for XGBoost:
Confusion Matrix:
[[19 0 0]
[ 0 13 0]
[ 0 0 13]]
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 19
1 1.00 1.00 1.00 13
2 1.00 1.00 1.00 13
accuracy 1.00 45
macro avg 1.00 1.00 1.00 45
weighted avg 1.00 1.00 1.00 45
Accuracy: 1.0
In [ ]:
In [ ]:
'''Pract 6- C. Build a Tic-Tac-Toe game using reinforcement learning in Python by using following
tasks
a. Setting up the environment
b. Defining the Tic-Tac-Toe game
c. Building the reinforcement learning model
d. Training the model
e. Testing the model
In [15]:
import numpy as np
import random
class TicTacToe:
def __init__(self):
self.board = np.zeros((3, 3), dtype=int)
self.done = False
self.winner = None
def reset(self):
self.board = np.zeros((3, 3), dtype=int)
self.done = False
self.winner = None
return self.board
def check_winner(self):
for i in range(3):
if abs(sum(self.board[i, :])) == 3:
self.winner = np.sign(self.board[i, 0])
return True
if abs(sum(self.board[:, i])) == 3:
self.winner = np.sign(self.board[0, i])
return True
if abs(self.board[0, 0] + self.board[1, 1] + self.board[2, 2]) == 3:
self.winner = np.sign(self.board[0, 0])
return True
if abs(self.board[0, 2] + self.board[1, 1] + self.board[2, 0]) == 3:
self.winner = np.sign(self.board[0, 2])
return True
if not np.any(self.board == 0):
self.winner = 0 # Draw
return True
return False
def step(self, player, action):
if self.board[action[0], action[1]] != 0 or self.done:
return False
self.board[action[0], action[1]] = player
self.done = self.check_winner()
return True
def available_actions(self):
return list(zip(*np.where(self.board == 0)))
# Q-learning agent
class QLearningAgent:
def __init__(self, alpha=0.1, gamma=0.9, epsilon=1.0):
self.q_table = {}
self.alpha = alpha
self.gamma = gamma
self.epsilon = epsilon
def get_q_values(self, state):
return self.q_table.get(state.tobytes(), np.zeros(9))
def choose_action(self, state, available_actions):
if random.random() < self.epsilon:
return random.choice(available_actions)
q_values = self.get_q_values(state)
best_action_index = np.argmax(q_values)
best_action = available_actions[best_action_index % len(available_actions)]
return best_action
def update_q_value(self, state, action, reward, next_state):
q_values = self.get_q_values(state)
action_index = action[0] * 3 + action[1]
next_max = np.max(self.get_q_values(next_state))
q_values[action_index] += self.alpha * (reward + self.gamma * next_max - q_values[action_index])
self.q_table[state.tobytes()] = q_values
# Training the agent
env = TicTacToe()
agent = QLearningAgent()
episodes = 1000
for episode in range(episodes):
state = env.reset()
done = False
while not done:
available_actions = env.available_actions()
action = agent.choose_action(state, available_actions)
player = 1 if episode % 2 == 0 else -1 # Alternate players
env.step(player, action)
next_state = env.board
reward = 1 if env.winner == player else -1 if env.winner == -player else 0
agent.update_q_value(state, action, reward, next_state)
state = next_state
if env.done:
break
if episode % 100 == 0:
print(f"Episode {episode} completed.")
# Testing the agent
for test_game in range(3):
state = env.reset()
done = False
print(f"\nTest Game {test_game + 1}")
while not done:
available_actions = env.available_actions()
action = agent.choose_action(state, available_actions)
player = 1 if test_game % 2 == 0 else -1
env.step(player, action)
print(env.board)
state = env.board
if env.done:
if env.winner == 1:
print("Agent (Player 1) wins!")
elif env.winner == -1:
print("Opponent (Player -1) wins!")
else:
print("It's a draw!")
break
Episode 0 completed. Episode 100 completed. Episode 200 completed. Episode 300 completed. Episode 400 completed. Episode 500 completed. Episode 600 completed. Episode 700 completed. Episode 800 completed. Episode 900 completed. Test Game 1 [[1 0 0] [0 0 0] [0 0 0]] [[1 0 0] [0 1 0] [0 0 0]] [[1 0 0] [1 1 0] [0 0 0]] [[1 1 0] [1 1 0] [0 0 0]] [[1 1 1] [1 1 0] [0 0 0]] Agent (Player 1) wins! Test Game 2 [[ 0 -1 0] [ 0 0 0] [ 0 0 0]] [[-1 -1 0] [ 0 0 0] [ 0 0 0]] [[-1 -1 -1] [ 0 0 0] [ 0 0 0]] Opponent (Player -1) wins! Test Game 3 [[0 0 0] [0 0 0] [0 1 0]] [[0 0 1] [0 0 0] [0 1 0]] [[1 0 1] [0 0 0] [0 1 0]] [[1 0 1] [1 0 0] [0 1 0]] [[1 1 1] [1 0 0] [0 1 0]] Agent (Player 1) wins!
In [ ]:
In [ ]:
#OR
In [16]:
import numpy as np
import random
# Task a & b: Setting up the Tic-Tac-Toe environment
class TicTacToeEnv:
def __init__(self):
self.reset()
def reset(self):
self.board = np.zeros((3, 3), dtype=int)
self.done = False
self.current_player = 1 # 1 for 'X', -1 for 'O'
return tuple(self.board.flatten())
def available_actions(self):
return [(i, j) for i in range(3) for j in range(3) if self.board[i, j] == 0]
def step(self, action):
if self.done:
return tuple(self.board.flatten()), 0, True # Game is already over
i, j = action
self.board[i, j] = self.current_player
reward = self.check_winner()
self.done = reward != 0 or not self.available_actions()
self.current_player *= -1
return tuple(self.board.flatten()), reward, self.done
def check_winner(self):
for i in range(3):
if abs(sum(self.board[i, :])) == 3 or abs(sum(self.board[:, i])) == 3:
return 1 * self.current_player
if abs(sum(self.board.diagonal())) == 3 or abs(sum(np.fliplr(self.board).diagonal())) == 3:
return 1 * self.current_player
return 0
# Task c: Building the Q-learning model
Q = {}
def choose_action(state, epsilon=0.1):
if state not in Q:
Q[state] = {a: 0 for a in env.available_actions()}
return random.choice(env.available_actions()) if random.random() < epsilon else max(Q[state], key=Q[state].get)
def update_q(state, action, reward, next_state, alpha=0.1, gamma=0.95):
if state not in Q:
Q[state] = {a: 0 for a in env.available_actions()}
if next_state not in Q:
# Set Q[next_state] with a default value of 0 if no available actions
Q[next_state] = {a: 0 for a in env.available_actions()} or {(0, 0): 0}
# Q-learning update rule with terminal state check
max_future_q = max(Q[next_state].values()) if Q[next_state] else 0
Q[state][action] += alpha * (reward + gamma * max_future_q - Q[state][action])
# Task d: Training the model
env = TicTacToeEnv()
for episode in range(10000):
state = env.reset()
done = False
while not done:
action = choose_action(state)
next_state, reward, done = env.step(action)
update_q(state, action, reward, next_state)
state = next_state
# Task e: Testing the model
def test_model():
state = env.reset()
done = False
while not done:
action = choose_action(state, epsilon=0) # Always exploit
next_state, reward, done = env.step(action)
print(np.reshape(next_state, (3, 3)))
state = next_state
if done:
if reward > 0:
print("AI won!")
elif reward < 0:
print("AI lost!")
else:
print("It's a draw!")
# Run a test
test_model()
[[1 0 0] [0 0 0] [0 0 0]] [[ 1 -1 0] [ 0 0 0] [ 0 0 0]] [[ 1 -1 0] [ 0 0 0] [ 1 0 0]] [[ 1 -1 -1] [ 0 0 0] [ 1 0 0]] [[ 1 -1 -1] [ 1 0 0] [ 1 0 0]] AI won!
In [ ]:
In [ ]:
In [ ]:
In [ ]:
#---------------------------------------------------------DMV-----------------------------------------------------------------------------------
In [ ]:
In [ ]:
'''Pract 7 - 7 Data Loading, Storage and File Formats
Problem Statement: Analyzing Sales Data from Multiple File Formats
Dataset: Sales data in multiple file formats (e.g., CSV, Excel, JSON)
Description: The goal is to load and analyze sales data from different file formats, including
CSV, Excel, and JSON, and perform data cleaning, transformation, and analysis on the
dataset.
Tasks to Perform:
Obtain sales data files in various formats, such as CSV, Excel, and JSON.
1. Load the sales data from each file format into the appropriate data structures or
dataframes.
2. Explore the structure and content of the loaded data, identifying any inconsistencies,
missing values, or data quality issues.
3. Perform data cleaning operations, such as handling missing values, removing
duplicates, or correcting inconsistencies.
4. Convert the data into a unified format, such as a common dataframe or data structure,
to enable seamless analysis
5. Perform data transformation tasks, such as merging multiple datasets, splitting
columns, or deriving new variables.
6. Analyze the sales data by performing descriptive statistics, aggregating data by
specific variables, or calculating metrics such as total sales, average order value, or
product category distribution.
7. Create visualizations, such as bar plots, pie charts, or box plots, to represent the sales
data and gain insights into sales trends, customer behavior, or product performance.
In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
# Load CSV
df_csv = pd.read_csv("sales_data_sample.csv" , encoding="cp1252")
# Load Excel
df_excel = pd.read_excel("Sample-Sales-Data.xlsx")
# Load JSON
df_json = pd.read_json("sales3.json")
print("CSV Data Head:\n", df_csv.head())
print("Excel Data Head:\n", df_excel.head())
print("JSON Data Head:\n", df_json.head())
print("\nCSV Data Info:")
print(df_csv.info())
print("\nExcel Data Info:")
print(df_excel.info())
print("\nJSON Data Info:")
print(df_json.info())
# Check for missing values
print("CSV Missing Values:\n", df_csv.isnull().sum())
print("Excel Missing Values:\n", df_excel.isnull().sum())
print("JSON Missing Values:\n", df_json.isnull().sum())
# Fill or drop missing values (example: filling with mean for numerical columns)
df_csv.fillna(df_csv.mean(), inplace=True)
df_excel.fillna(df_excel.mean(), inplace=True)
df_json.fillna(df_json.mean(), inplace=True)
# Remove duplicates
df_csv.drop_duplicates(inplace=True)
df_excel.drop_duplicates(inplace=True)
df_json.drop_duplicates(inplace=True)
# Standardize column names across datasets if they differ
df_csv.columns = df_csv.columns.str.lower()
df_excel.columns = df_excel.columns.str.lower()
df_json.columns = df_json.columns.str.lower()
print(df_csv.dtypes)
print(df_excel.dtypes)
print(df_json.dtypes)
'''
# Combine datasets into a single DataFrame
combined_df = pd.concat([df_csv, df_excel, df_json], ignore_index=True)
print("Combined Data Head:\n", combined_df.head())
# Example: Create new column for Total Sales
combined_df['total_sales'] = combined_df['quantity'] * combined_df['price']
# Example: Extract year and month from a 'date' column
combined_df['year'] = pd.DatetimeIndex(combined_df['date']).year
combined_df['month'] = pd.DatetimeIndex(combined_df['date']).month
# Calculate total and average sales
total_sales = combined_df['total_sales'].sum()
average_order_value = combined_df['total_sales'].mean()
# Aggregate sales by product category
sales_by_category = combined_df.groupby('category')['total_sales'].sum()
# Display analysis results
print("Total Sales:", total_sales)
print("Average Order Value:", average_order_value)
print("Sales by Category:\n", sales_by_category)
# Set plot style
sns.set(style="whitegrid")
# Bar plot for sales by category
plt.figure(figsize=(10, 6))
sns.barplot(x=sales_by_category.index, y=sales_by_category.values)
plt.title("Total Sales by Product Category")
plt.xlabel("Category")
plt.ylabel("Total Sales")
plt.xticks(rotation=45)
plt.show()
# Pie chart for category distribution
plt.figure(figsize=(8, 8))
combined_df['category'].value_counts().plot.pie(autopct='%1.1f%%')
plt.title("Product Category Distribution")
plt.ylabel("")
plt.show()
# Box plot to analyze order values
plt.figure(figsize=(10, 6))
sns.boxplot(x="category", y="total_sales", data=combined_df)
plt.title("Order Value Distribution by Category")
plt.xlabel("Category")
plt.ylabel("Order Value")
plt.xticks(rotation=45)
plt.show()
'''
CSV Data Head:
ORDERNUMBER QUANTITYORDERED PRICEEACH ORDERLINENUMBER SALES \
0 10107 30 95.70 2 2871.00
1 10121 34 81.35 5 2765.90
2 10134 41 94.74 2 3884.34
3 10145 45 83.26 6 3746.70
4 10159 49 100.00 14 5205.27
ORDERDATE STATUS QTR_ID MONTH_ID YEAR_ID ... \
0 2/24/2003 0:00 Shipped 1 2 2003 ...
1 5/7/2003 0:00 Shipped 2 5 2003 ...
2 7/1/2003 0:00 Shipped 3 7 2003 ...
3 8/25/2003 0:00 Shipped 3 8 2003 ...
4 10/10/2003 0:00 Shipped 4 10 2003 ...
ADDRESSLINE1 ADDRESSLINE2 CITY STATE \
0 897 Long Airport Avenue NaN NYC NY
1 59 rue de l'Abbaye NaN Reims NaN
2 27 rue du Colonel Pierre Avia NaN Paris NaN
3 78934 Hillside Dr. NaN Pasadena CA
4 7734 Strong St. NaN San Francisco CA
POSTALCODE COUNTRY TERRITORY CONTACTLASTNAME CONTACTFIRSTNAME DEALSIZE
0 10022 USA NaN Yu Kwai Small
1 51100 France EMEA Henriot Paul Small
2 75508 France EMEA Da Cunha Daniel Medium
3 90003 USA NaN Young Julie Medium
4 NaN USA NaN Brown Julie Medium
[5 rows x 25 columns]
Excel Data Head:
Postcode Sales_Rep_ID Sales_Rep_Name Year Value
0 2121 456 Jane 2011 84219.497311
1 2092 789 Ashish 2012 28322.192268
2 2128 456 Jane 2013 81878.997241
3 2073 123 John 2011 44491.142121
4 2134 789 Ashish 2012 71837.720959
JSON Data Head:
id email first last \
0 1 isidro_von@hotmail.com Torrey Veum
1 2 frederique19@gmail.com Micah Sanford
2 3 fredy54@gmail.com Hollis Swift
3 4 braxton29@hotmail.com Perry Leffler
4 5 turner59@gmail.com Janelle Hagenes
company created_at \
0 Hilll, Mayert and Wolf 2014-12-25 04:06:27.981000+00:00
1 Stokes-Reichel 2014-07-03 16:08:17.044000+00:00
2 Rodriguez, Cartwright and Kuhn 2014-08-18 06:15:16.731000+00:00
3 Sipes, Feeney and Hansen 2014-07-10 11:31:40.235000+00:00
4 Lesch and Daughters 2014-04-21 15:05:43.229000+00:00
country
0 Switzerland
1 Democratic People's Republic of Korea
2 Tunisia
3 Chad
4 Swaziland
CSV Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2823 entries, 0 to 2822
Data columns (total 25 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDERNUMBER 2823 non-null int64
1 QUANTITYORDERED 2823 non-null int64
2 PRICEEACH 2823 non-null float64
3 ORDERLINENUMBER 2823 non-null int64
4 SALES 2823 non-null float64
5 ORDERDATE 2823 non-null object
6 STATUS 2823 non-null object
7 QTR_ID 2823 non-null int64
8 MONTH_ID 2823 non-null int64
9 YEAR_ID 2823 non-null int64
10 PRODUCTLINE 2823 non-null object
11 MSRP 2823 non-null int64
12 PRODUCTCODE 2823 non-null object
13 CUSTOMERNAME 2823 non-null object
14 PHONE 2823 non-null object
15 ADDRESSLINE1 2823 non-null object
16 ADDRESSLINE2 302 non-null object
17 CITY 2823 non-null object
18 STATE 1337 non-null object
19 POSTALCODE 2747 non-null object
20 COUNTRY 2823 non-null object
21 TERRITORY 1749 non-null object
22 CONTACTLASTNAME 2823 non-null object
23 CONTACTFIRSTNAME 2823 non-null object
24 DEALSIZE 2823 non-null object
dtypes: float64(2), int64(7), object(16)
memory usage: 551.5+ KB
None
Excel Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390 entries, 0 to 389
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Postcode 390 non-null int64
1 Sales_Rep_ID 390 non-null int64
2 Sales_Rep_Name 390 non-null object
3 Year 390 non-null int64
4 Value 390 non-null float64
dtypes: float64(1), int64(3), object(1)
memory usage: 15.4+ KB
None
JSON Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 9999 non-null int64
1 email 9999 non-null object
2 first 9999 non-null object
3 last 9999 non-null object
4 company 9999 non-null object
5 created_at 9999 non-null datetime64[ns, UTC]
6 country 9999 non-null object
dtypes: datetime64[ns, UTC](1), int64(1), object(5)
memory usage: 546.9+ KB
None
CSV Missing Values:
ORDERNUMBER 0
QUANTITYORDERED 0
PRICEEACH 0
ORDERLINENUMBER 0
SALES 0
ORDERDATE 0
STATUS 0
QTR_ID 0
MONTH_ID 0
YEAR_ID 0
PRODUCTLINE 0
MSRP 0
PRODUCTCODE 0
CUSTOMERNAME 0
PHONE 0
ADDRESSLINE1 0
ADDRESSLINE2 2521
CITY 0
STATE 1486
POSTALCODE 76
COUNTRY 0
TERRITORY 1074
CONTACTLASTNAME 0
CONTACTFIRSTNAME 0
DEALSIZE 0
dtype: int64
Excel Missing Values:
Postcode 0
Sales_Rep_ID 0
Sales_Rep_Name 0
Year 0
Value 0
dtype: int64
JSON Missing Values:
id 0
email 0
first 0
last 0
company 0
created_at 0
country 0
dtype: int64
ORDERNUMBER int64
QUANTITYORDERED int64
PRICEEACH float64
ORDERLINENUMBER int64
SALES float64
ORDERDATE object
STATUS object
QTR_ID int64
MONTH_ID int64
YEAR_ID int64
PRODUCTLINE object
MSRP int64
PRODUCTCODE object
CUSTOMERNAME object
PHONE object
ADDRESSLINE1 object
ADDRESSLINE2 object
CITY object
STATE object
POSTALCODE object
COUNTRY object
TERRITORY object
CONTACTLASTNAME object
CONTACTFIRSTNAME object
DEALSIZE object
dtype: object
Postcode int64
Sales_Rep_ID int64
Sales_Rep_Name object
Year int64
Value float64
dtype: object
id int64
email object
first object
last object
company object
created_at datetime64[ns, UTC]
country object
dtype: object
Out[3]:
'\n# Combine datasets into a single DataFrame\ncombined_df = pd.concat([df_csv, df_excel, df_json], ignore_index=True)\nprint("Combined Data Head:\n", combined_df.head())\n\n\n\n# Example: Create new column for Total Sales\ncombined_df[\'total_sales\'] = combined_df[\'quantity\'] * combined_df[\'price\']\n\n# Example: Extract year and month from a \'date\' column\ncombined_df[\'year\'] = pd.DatetimeIndex(combined_df[\'date\']).year\ncombined_df[\'month\'] = pd.DatetimeIndex(combined_df[\'date\']).month\n\n\n\n# Calculate total and average sales\ntotal_sales = combined_df[\'total_sales\'].sum()\naverage_order_value = combined_df[\'total_sales\'].mean()\n\n# Aggregate sales by product category\nsales_by_category = combined_df.groupby(\'category\')[\'total_sales\'].sum()\n\n# Display analysis results\nprint("Total Sales:", total_sales)\nprint("Average Order Value:", average_order_value)\nprint("Sales by Category:\n", sales_by_category)\n\n\n\n# Set plot style\nsns.set(style="whitegrid")\n\n# Bar plot for sales by category\nplt.figure(figsize=(10, 6))\nsns.barplot(x=sales_by_category.index, y=sales_by_category.values)\nplt.title("Total Sales by Product Category")\nplt.xlabel("Category")\nplt.ylabel("Total Sales")\nplt.xticks(rotation=45)\nplt.show()\n\n# Pie chart for category distribution\nplt.figure(figsize=(8, 8))\ncombined_df[\'category\'].value_counts().plot.pie(autopct=\'%1.1f%%\')\nplt.title("Product Category Distribution")\nplt.ylabel("")\nplt.show()\n\n# Box plot to analyze order values\nplt.figure(figsize=(10, 6))\nsns.boxplot(x="category", y="total_sales", data=combined_df)\nplt.title("Order Value Distribution by Category")\nplt.xlabel("Category")\nplt.ylabel("Order Value")\nplt.xticks(rotation=45)\nplt.show()\n'
In [ ]:
In [24]:
!pip install openpyxl
Collecting openpyxl Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB) Collecting et-xmlfile (from openpyxl) Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB) Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB) Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB) Installing collected packages: et-xmlfile, openpyxl Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
In [ ]:
In [ ]:
#OR
In [4]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import json
csv = pd.read_csv("sales_data_sample.csv", encoding="cp1252")
ed = pd.read_excel("Sample-Sales-Data.xlsx")
with open("customers.json", "r") as json_file:
json_data = json.load(json_file)
csv.tail()
csv.info()
csv.describe()
csv.dropna()
csv.drop_duplicates()
ed.head()
ed.tail()
ed.info()
ed.describe()
unified_data = pd.concat([csv, ed], ignore_index=True)
total_sales = unified_data['SALES'].sum()
print("Total Sales:", total_sales)
category_sales = unified_data.groupby('ORDERNUMBER')['SALES'].mean()
category_counts = unified_data['SALES'].value_counts()
category_counts.plot(kind='bar')
plt.title('Product Category Distribution')
plt.xlabel('Category')
plt.ylabel('Count')
plt.show()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2823 entries, 0 to 2822 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDERNUMBER 2823 non-null int64 1 QUANTITYORDERED 2823 non-null int64 2 PRICEEACH 2823 non-null float64 3 ORDERLINENUMBER 2823 non-null int64 4 SALES 2823 non-null float64 5 ORDERDATE 2823 non-null object 6 STATUS 2823 non-null object 7 QTR_ID 2823 non-null int64 8 MONTH_ID 2823 non-null int64 9 YEAR_ID 2823 non-null int64 10 PRODUCTLINE 2823 non-null object 11 MSRP 2823 non-null int64 12 PRODUCTCODE 2823 non-null object 13 CUSTOMERNAME 2823 non-null object 14 PHONE 2823 non-null object 15 ADDRESSLINE1 2823 non-null object 16 ADDRESSLINE2 302 non-null object 17 CITY 2823 non-null object 18 STATE 1337 non-null object 19 POSTALCODE 2747 non-null object 20 COUNTRY 2823 non-null object 21 TERRITORY 1749 non-null object 22 CONTACTLASTNAME 2823 non-null object 23 CONTACTFIRSTNAME 2823 non-null object 24 DEALSIZE 2823 non-null object dtypes: float64(2), int64(7), object(16) memory usage: 551.5+ KB <class 'pandas.core.frame.DataFrame'> RangeIndex: 390 entries, 0 to 389 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Postcode 390 non-null int64 1 Sales_Rep_ID 390 non-null int64 2 Sales_Rep_Name 390 non-null object 3 Year 390 non-null int64 4 Value 390 non-null float64 dtypes: float64(1), int64(3), object(1) memory usage: 15.4+ KB Total Sales: 10032628.85
In [ ]:
In [ ]:
#OR
In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df1=pd.read_csv('sales_data_sample.csv', encoding='ISO-8859-1')
df2=pd.read_json('sales.json')
df3=pd.read_excel('sales_data.xlsx')
df1.head()
df2.head()
df3.head()
df1.shape
df2.shape
df3.shape
df1.info()
#Checks for NA values in columns
df1.isna().sum()
#for calculating some statistical data like percentile, mean and std of the numeric
df1.describe()
#Dropping unnecessary columns
df1 = df1.drop(['ADDRESSLINE1','ADDRESSLINE2','CITY','STATE','TERRITORY'],axis = 1)
#Filling all NA values with mode of the POSTALCODE column
df1 = df1['POSTALCODE'].fillna(df1.POSTALCODE.mode(), inplace=True)
df2.info()
df2.isna().sum()
df2 = df2.drop(['ADDRESSLINE1','ADDRESSLINE2','CITY','STATE','TERRITORY'],axis = 1)
df2.describe()
df3.info()
df2.isna().sum()
df3.describe()
import pandas as pd
# Load the CSV file into a DataFrame
df1 = pd.read_csv('sales_data_sample.csv',encoding='ISO-8859-1')
df2 = pd.read_json('sales.json')
df3 = pd.read_excel('sales_data.xlsx')
# Check the data types of columns
data_types = df1.dtypes
print(data_types)
import seaborn as sns
import matplotlib.pyplot as plt
# Assuming your DataFrame is named df1
sns.histplot(x='STATUS', data=df1)
plt.show()
#Plotting histogram plot for STATUS column
import seaborn as sns
import matplotlib.pyplot as plt
sns.histplot(x='STATUS', data=df2, )
plt.show()
#Plotting histogram plot for STATUS column
import seaborn as sns
import matplotlib.pyplot as plt
sns.histplot(x='STATUS', data=df3, )
plt.show()
#Plotting histogram plot for MONTH_ID column
import seaborn as sns
import matplotlib.pyplot as plt
sns.histplot(x='MONTH_ID', data=df1, )
plt.show()
#Plotting histogram plot for MONTH_ID column
import seaborn as sns
import matplotlib.pyplot as plt
sns.histplot(x='MONTH_ID', data=df2, )
plt.show()
#Plotting histogram plot for MONTH_ID column
import seaborn as sns
import matplotlib.pyplot as plt
sns.histplot(x='MONTH_ID', data=df3, )
plt.show()
#Plotting boxplot for STATUS column against MONTH_ID column
sns.boxplot( x="STATUS", y= "MONTH_ID", data=df1, )
plt.show()
#Plotting boxplot for STATUS column against MONTH_ID column
sns.boxplot( x="STATUS", y= "MONTH_ID", data=df2, )
plt.show()
#Plotting boxplot for STATUS column against MONTH_ID column
sns.boxplot( x="STATUS", y= "MONTH_ID", data=df3, )
plt.show()
#Plotting Scatterplot
sns.scatterplot( x="STATUS", y="MONTH_ID", data=df1,
hue='COUNTRY', size='YEAR_ID')
# Placing Legend outside the Figure
plt.legend(bbox_to_anchor=(1, 1), loc=2)
plt.show()
#Plotting Scatterplot
sns.scatterplot( x="STATUS", y="MONTH_ID", data=df2,
hue='COUNTRY', size='YEAR_ID')
# Placing Legend outside the Figure
plt.legend(bbox_to_anchor=(1, 1), loc=2)
plt.show()
#Plotting Scatterplot
sns.scatterplot( x="STATUS", y="MONTH_ID", data=df3,
hue='COUNTRY', size='YEAR_ID')
# Placing Legend outside the Figure
plt.legend(bbox_to_anchor=(1, 1), loc=2)
plt.show()
#Checking the data only for shipped STATUS
data1=df1[df1["STATUS"]=='Shipped']
data1.head()
#Checking the data only for shipped STATUS
data2=df2[df2["STATUS"]=='Shipped']
data2.head()
#Checking the data only for shipped STATUS
data3=df3[df3["STATUS"]=='Shipped']
data3.head()
data1.shape
data2.shape
data3.shape
#Calculating sum for sales column
sum_sales = df1['SALES'].sum()
print("Addition of all sales",sum_sales)
#Calculating sum for sales column
sum_sales = df2['SALES'].sum()
print("Addition of all sales",sum_sales)
#Calculating sum for sales column
sum_sales = df3['SALES'].sum()
print("Addition of all sales",sum_sales)
#Calulating average for sales column
sales_avg = df1['SALES'].mean()
print("Average of total sales = ",sales_avg)
#Calulating average for sales column
sales_avg = df2['SALES'].mean()
print("Average of total sales = ",sales_avg)
#Calulating average for sales column
sales_avg = df3['SALES'].mean()
print("Average of total sales = ",sales_avg)
import sklearn
import pandas as pd
import seaborn as sns
# IQR
Q1 = np.percentile(df1['SALES'], 25,
interpolation = 'midpoint')
Q3 = np.percentile(df1['SALES'], 75,
interpolation = 'midpoint')
IQR = Q3 - Q1
print("Old Shape: ", df1.shape)
# Upper bound
upper = np.where(df1['SALES'] >= (Q3+1.5*IQR))
# Lower bound
lower = np.where(df1['SALES'] <= (Q1-1.5*IQR))
# Removing the Outliers
df1.drop(upper[0], inplace = True)
df1.drop(lower[0], inplace = True)
print("New Shape: ", df1.shape)
sns.boxplot(x='SALES', data=df1)
import sklearn
import pandas as pd
import seaborn as sns
# IQR
Q1 = np.percentile(df2['SALES'], 25,
interpolation = 'midpoint')
Q3 = np.percentile(df2['SALES'], 75,
interpolation = 'midpoint')
IQR = Q3 - Q1
print("Old Shape: ", df2.shape)
# Upper bound
upper = np.where(df2['SALES'] >= (Q3+1.5*IQR))
# Lower bound
lower = np.where(df2['SALES'] <= (Q1-1.5*IQR))
# Removing the Outliers
df2.drop(upper[0], inplace = True)
import sklearn
import pandas as pd
import seaborn as sns
# IQR
Q1 = np.percentile(df3['SALES'], 25,
interpolation = 'midpoint')
Q3 = np.percentile(df3['SALES'], 75,
interpolation = 'midpoint')
IQR = Q3 - Q1
print("Old Shape: ", df3.shape)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2823 entries, 0 to 2822 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDERNUMBER 2823 non-null int64 1 QUANTITYORDERED 2823 non-null int64 2 PRICEEACH 2823 non-null float64 3 ORDERLINENUMBER 2823 non-null int64 4 SALES 2823 non-null float64 5 ORDERDATE 2823 non-null object 6 STATUS 2823 non-null object 7 QTR_ID 2823 non-null int64 8 MONTH_ID 2823 non-null int64 9 YEAR_ID 2823 non-null int64 10 PRODUCTLINE 2823 non-null object 11 MSRP 2823 non-null int64 12 PRODUCTCODE 2823 non-null object 13 CUSTOMERNAME 2823 non-null object 14 PHONE 2823 non-null object 15 ADDRESSLINE1 2823 non-null object 16 ADDRESSLINE2 302 non-null object 17 CITY 2823 non-null object 18 STATE 1337 non-null object 19 POSTALCODE 2747 non-null object 20 COUNTRY 2823 non-null object 21 TERRITORY 1749 non-null object 22 CONTACTLASTNAME 2823 non-null object 23 CONTACTFIRSTNAME 2823 non-null object 24 DEALSIZE 2823 non-null object dtypes: float64(2), int64(7), object(16) memory usage: 551.5+ KB <class 'pandas.core.frame.DataFrame'> RangeIndex: 2823 entries, 0 to 2822 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDERNUMBER 2823 non-null int64 1 QUANTITYORDERED 2823 non-null int64 2 PRICEEACH 2823 non-null float64 3 ORDERLINENUMBER 2823 non-null int64 4 SALES 2823 non-null float64 5 ORDERDATE 2823 non-null object 6 STATUS 2823 non-null object 7 QTR_ID 2823 non-null int64 8 MONTH_ID 2823 non-null int64 9 YEAR_ID 2823 non-null int64 10 PRODUCTLINE 2823 non-null object 11 MSRP 2823 non-null int64 12 PRODUCTCODE 2823 non-null object 13 CUSTOMERNAME 2823 non-null object 14 PHONE 2823 non-null object 15 ADDRESSLINE1 2823 non-null object 16 ADDRESSLINE2 2823 non-null object 17 CITY 2823 non-null object 18 STATE 2823 non-null object 19 POSTALCODE 2823 non-null object 20 COUNTRY 2823 non-null object 21 TERRITORY 2823 non-null object 22 CONTACTLASTNAME 2823 non-null object 23 CONTACTFIRSTNAME 2823 non-null object 24 DEALSIZE 2823 non-null object dtypes: float64(2), int64(7), object(16) memory usage: 551.5+ KB <class 'pandas.core.frame.DataFrame'> RangeIndex: 2823 entries, 0 to 2822 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDERNUMBER 2823 non-null int64 1 QUANTITYORDERED 2823 non-null int64 2 PRICEEACH 2823 non-null float64 3 ORDERLINENUMBER 2823 non-null int64 4 SALES 2823 non-null float64 5 ORDERDATE 2823 non-null object 6 STATUS 2823 non-null object 7 QTR_ID 2823 non-null int64 8 MONTH_ID 2823 non-null int64 9 YEAR_ID 2823 non-null int64 10 PRODUCTLINE 2823 non-null object 11 MSRP 2823 non-null int64 12 PRODUCTCODE 2823 non-null object 13 CUSTOMERNAME 2823 non-null object 14 PHONE 2823 non-null object 15 ADDRESSLINE1 2823 non-null object 16 ADDRESSLINE2 302 non-null object 17 CITY 2823 non-null object 18 STATE 1337 non-null object 19 POSTALCODE 2747 non-null object 20 COUNTRY 2823 non-null object 21 TERRITORY 1749 non-null object 22 CONTACTLASTNAME 2823 non-null object 23 CONTACTFIRSTNAME 2823 non-null object 24 DEALSIZE 2823 non-null object dtypes: float64(2), int64(7), object(16) memory usage: 551.5+ KB ORDERNUMBER int64 QUANTITYORDERED int64 PRICEEACH float64 ORDERLINENUMBER int64 SALES float64 ORDERDATE object STATUS object QTR_ID int64 MONTH_ID int64 YEAR_ID int64 PRODUCTLINE object MSRP int64 PRODUCTCODE object CUSTOMERNAME object PHONE object ADDRESSLINE1 object ADDRESSLINE2 object CITY object STATE object POSTALCODE object COUNTRY object TERRITORY object CONTACTLASTNAME object CONTACTFIRSTNAME object DEALSIZE object dtype: object
Addition of all sales 10032628.85 Addition of all sales 10032628.85 Addition of all sales 10032628.85 Average of total sales = 3553.889071909316 Average of total sales = 3553.889071909316 Average of total sales = 3553.889071909316 Old Shape: (2823, 25) New Shape: (2742, 25) Old Shape: (2823, 25) Old Shape: (2823, 25)
In [ ]:
In [ ]:
#OR
In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re # Importing the re module for regular expressions
# Load datasets
dcsv = pd.read_csv('customers.csv')
djson = pd.read_json('customers.json')
dxlsx = pd.read_excel('customers.xlsx')
# Display first few rows of each dataset
print(dcsv.head())
print(djson.head())
print(dxlsx.head())
# Check for missing values in each dataset
print(f"Missing values in csv\n{dcsv.isna().sum()}")
print(f"\nMissing values in json\n{djson.isna().sum()}")
print(f"\nMissing values in xlsx\n{dxlsx.isna().sum()}")
# Get information about each dataset
print(f"info of csv")
dcsv.info()
print(f"\ninfo of json")
djson.info()
print(f"\ninfo of xlsx")
dxlsx.info()
# Fill missing values with 0
dcsv.fillna(0, inplace=True)
djson.fillna(0, inplace=True)
dxlsx.fillna(0, inplace=True)
# Check for duplicates
print("Duplicate entries in csv:\n", dcsv[dcsv.duplicated()])
print("Duplicate entries in json:\n", djson[djson.duplicated()])
print("Duplicate entries in xlsx:\n", dxlsx[dxlsx.duplicated()])
# Drop duplicates
dcsv.drop_duplicates(inplace=True)
djson.drop_duplicates(inplace=True)
dxlsx.drop_duplicates(inplace=True)
# Concatenate dataframes
uni_df = pd.concat([dcsv, djson, dxlsx], ignore_index=True)
print(f"Combined dataset shape: {uni_df.shape}")
# Create a full name column
dcsv['full name'] = dcsv['first_name'] + ' ' + dcsv['last_name']
print(dcsv.head())
# Function to extract pin code from address
def extract_pin_code(address):
match = re.search(r'\b\d{5}\b', address)
return match.group(0) if match else None # Return None if no match is found
# Apply the function to create a pin code column
dcsv['pin code'] = [extract_pin_code(add) for add in dcsv['address']]
print(dcsv.head())
# Describe the combined dataframe
print(uni_df.describe())
# Aggregate data by job category
job_summary = uni_df.groupby('job').agg({'orders': 'sum', 'spent': 'mean'})
print(job_summary)
# Calculate total sales
total_sales = uni_df['spent'].sum()
print(f"Total Sales: {total_sales}")
# Calculate average spending
average_spent = uni_df['spent'].mean()
print(f"Average Spending: {average_spent}")
# Count of each job category
job_counts = uni_df['job'].value_counts()
print(job_counts)
# Visualizations
# Bar plot of sales by job category
plt.figure(figsize=(12, 6))
sns.barplot(x='job', y='spent', data=uni_df)
plt.title('Sales by Job Category')
plt.xlabel('Job Category')
plt.ylabel('Total Sales')
plt.xticks(rotation=90)
plt.show()
# Pie chart for job category distribution
plt.figure(figsize=(10, 6))
product_distribution = uni_df['job'].value_counts()
plt.pie(product_distribution, labels=product_distribution.index, autopct='%1.2f%%', startangle=140)
plt.title('Job Category Distribution')
plt.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
# Box plot for order value distribution by job
plt.figure(figsize=(12, 6))
sns.boxplot(x='job', y='spent', data=uni_df)
plt.title('Order Value Distribution by Job')
plt.xlabel('Job')
plt.ylabel('Order Value')
plt.xticks(rotation=90)
plt.show()
first_name last_name email \
0 Joseph Rice josephrice131@slingacademy.com
1 Gary Moore garymoore386@slingacademy.com
2 John Walker johnwalker944@slingacademy.com
3 Eric Carter ericcarter176@slingacademy.com
4 William Jackson williamjackson427@slingacademy.com
phone address gender age \
0 +1-800-040-3135x6208 91773 Miller Shoal\nDiaztown, FL 38841 male 43
1 221.945.4191x8872 6450 John Lodge\nTerriton, KY 95945 male 71
2 388-142-4883x5370 27265 Murray Island\nKevinfort, PA 63231 male 44
3 (451)259-5402 USNS Knight\nFPO AA 76532 male 38
4 625.626.9133x374 170 Jackson Loaf\nKristenland, AS 48876 male 58
registered orders spent job hobbies \
0 2019-05-05 7 568.29 Artist Playing sports
1 2020-05-20 11 568.92 Artist Swimming
2 2020-04-04 11 497.12 Clerk Painting
3 2019-01-30 17 834.60 Butcher Playing musical instruments
4 2022-07-01 14 151.59 Engineer Reading
is_married
0 False
1 True
2 False
3 False
4 False
id email first last \
0 1 isidro_von@hotmail.com Torrey Veum
1 2 frederique19@gmail.com Micah Sanford
2 3 fredy54@gmail.com Hollis Swift
3 4 braxton29@hotmail.com Perry Leffler
4 5 turner59@gmail.com Janelle Hagenes
company created_at \
0 Hilll, Mayert and Wolf 2014-12-25 04:06:27.981000+00:00
1 Stokes-Reichel 2014-07-03 16:08:17.044000+00:00
2 Rodriguez, Cartwright and Kuhn 2014-08-18 06:15:16.731000+00:00
3 Sipes, Feeney and Hansen 2014-07-10 11:31:40.235000+00:00
4 Lesch and Daughters 2014-04-21 15:05:43.229000+00:00
country
0 Switzerland
1 Democratic People's Republic of Korea
2 Tunisia
3 Chad
4 Swaziland
first_name last_name email \
0 Joseph Rice josephrice131@slingacademy.com
1 Gary Moore garymoore386@slingacademy.com
2 John Walker johnwalker944@slingacademy.com
3 Eric Carter ericcarter176@slingacademy.com
4 William Jackson williamjackson427@slingacademy.com
phone address gender age \
0 +1-800-040-3135x6208 91773 Miller Shoal\nDiaztown, FL 38841 male 43
1 221.945.4191x8872 6450 John Lodge\nTerriton, KY 95945 male 71
2 388-142-4883x5370 27265 Murray Island\nKevinfort, PA 63231 male 44
3 (451)259-5402 USNS Knight\nFPO AA 76532 male 38
4 625.626.9133x374 170 Jackson Loaf\nKristenland, AS 48876 male 58
registered orders spent job hobbies \
0 2019-05-05 7 568.29 Artist Playing sports
1 2020-05-20 11 568.92 Artist Swimming
2 2020-04-04 11 497.12 Clerk Painting
3 2019-01-30 17 834.60 Butcher Playing musical instruments
4 2022-07-01 14 151.59 Engineer Reading
is_married
0 False
1 True
2 False
3 False
4 False
Missing values in csv
first_name 0
last_name 0
email 0
phone 0
address 0
gender 0
age 0
registered 0
orders 0
spent 0
job 0
hobbies 0
is_married 0
dtype: int64
Missing values in json
id 0
email 0
first 0
last 0
company 0
created_at 0
country 0
dtype: int64
Missing values in xlsx
first_name 0
last_name 0
email 0
phone 0
address 0
gender 0
age 0
registered 0
orders 0
spent 0
job 0
hobbies 0
is_married 0
dtype: int64
info of csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 first_name 1000 non-null object
1 last_name 1000 non-null object
2 email 1000 non-null object
3 phone 1000 non-null object
4 address 1000 non-null object
5 gender 1000 non-null object
6 age 1000 non-null int64
7 registered 1000 non-null object
8 orders 1000 non-null int64
9 spent 1000 non-null float64
10 job 1000 non-null object
11 hobbies 1000 non-null object
12 is_married 1000 non-null bool
dtypes: bool(1), float64(1), int64(2), object(9)
memory usage: 94.9+ KB
info of json
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 9999 non-null int64
1 email 9999 non-null object
2 first 9999 non-null object
3 last 9999 non-null object
4 company 9999 non-null object
5 created_at 9999 non-null datetime64[ns, UTC]
6 country 9999 non-null object
dtypes: datetime64[ns, UTC](1), int64(1), object(5)
memory usage: 546.9+ KB
info of xlsx
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 first_name 1000 non-null object
1 last_name 1000 non-null object
2 email 1000 non-null object
3 phone 1000 non-null object
4 address 1000 non-null object
5 gender 1000 non-null object
6 age 1000 non-null int64
7 registered 1000 non-null datetime64[ns]
8 orders 1000 non-null int64
9 spent 1000 non-null float64
10 job 1000 non-null object
11 hobbies 1000 non-null object
12 is_married 1000 non-null bool
dtypes: bool(1), datetime64[ns](1), float64(1), int64(2), object(8)
memory usage: 94.9+ KB
Duplicate entries in csv:
Empty DataFrame
Columns: [first_name, last_name, email, phone, address, gender, age, registered, orders, spent, job, hobbies, is_married]
Index: []
Duplicate entries in json:
Empty DataFrame
Columns: [id, email, first, last, company, created_at, country]
Index: []
Duplicate entries in xlsx:
Empty DataFrame
Columns: [first_name, last_name, email, phone, address, gender, age, registered, orders, spent, job, hobbies, is_married]
Index: []
Combined dataset shape: (11999, 19)
first_name last_name email \
0 Joseph Rice josephrice131@slingacademy.com
1 Gary Moore garymoore386@slingacademy.com
2 John Walker johnwalker944@slingacademy.com
3 Eric Carter ericcarter176@slingacademy.com
4 William Jackson williamjackson427@slingacademy.com
phone address gender age \
0 +1-800-040-3135x6208 91773 Miller Shoal\nDiaztown, FL 38841 male 43
1 221.945.4191x8872 6450 John Lodge\nTerriton, KY 95945 male 71
2 388-142-4883x5370 27265 Murray Island\nKevinfort, PA 63231 male 44
3 (451)259-5402 USNS Knight\nFPO AA 76532 male 38
4 625.626.9133x374 170 Jackson Loaf\nKristenland, AS 48876 male 58
registered orders spent job hobbies \
0 2019-05-05 7 568.29 Artist Playing sports
1 2020-05-20 11 568.92 Artist Swimming
2 2020-04-04 11 497.12 Clerk Painting
3 2019-01-30 17 834.60 Butcher Playing musical instruments
4 2022-07-01 14 151.59 Engineer Reading
is_married full name
0 False Joseph Rice
1 True Gary Moore
2 False John Walker
3 False Eric Carter
4 False William Jackson
first_name last_name email \
0 Joseph Rice josephrice131@slingacademy.com
1 Gary Moore garymoore386@slingacademy.com
2 John Walker johnwalker944@slingacademy.com
3 Eric Carter ericcarter176@slingacademy.com
4 William Jackson williamjackson427@slingacademy.com
phone address gender age \
0 +1-800-040-3135x6208 91773 Miller Shoal\nDiaztown, FL 38841 male 43
1 221.945.4191x8872 6450 John Lodge\nTerriton, KY 95945 male 71
2 388-142-4883x5370 27265 Murray Island\nKevinfort, PA 63231 male 44
3 (451)259-5402 USNS Knight\nFPO AA 76532 male 38
4 625.626.9133x374 170 Jackson Loaf\nKristenland, AS 48876 male 58
registered orders spent job hobbies \
0 2019-05-05 7 568.29 Artist Playing sports
1 2020-05-20 11 568.92 Artist Swimming
2 2020-04-04 11 497.12 Clerk Painting
3 2019-01-30 17 834.60 Butcher Playing musical instruments
4 2022-07-01 14 151.59 Engineer Reading
is_married full name pin code
0 False Joseph Rice 91773
1 True Gary Moore 95945
2 False John Walker 27265
3 False Eric Carter 76532
4 False William Jackson 48876
age orders spent id
count 2000.000000 2000.00000 2000.000000 9999.000000
mean 48.145000 10.03600 496.295100 5000.000000
std 18.347869 6.17412 289.156882 2886.607005
min 18.000000 0.00000 0.490000 1.000000
25% 32.000000 5.00000 248.100000 2500.500000
50% 47.000000 10.00000 469.045000 5000.000000
75% 64.000000 15.00000 739.660000 7499.500000
max 80.000000 20.00000 999.150000 9999.000000
orders spent
job
Accountant 672.0 480.695882
Architect 436.0 549.016667
Artist 394.0 576.194583
Athlete 604.0 503.059630
Baker 654.0 469.264194
Barber 692.0 453.617143
Butcher 650.0 430.244687
Carpenter 314.0 521.049000
Cashier 554.0 541.296538
Chef 700.0 501.610937
Clerk 508.0 486.912083
Dentist 600.0 498.399667
Doctor 618.0 498.835000
Engineer 606.0 485.854231
Farmer 418.0 417.005217
Firefighter 530.0 421.117692
Hairdresser 552.0 474.699259
Housewife 524.0 470.946154
Janitor 640.0 528.832000
Lawyer 698.0 524.990000
Librarian 272.0 445.107143
Mechanic 590.0 547.160606
Nurse 502.0 430.046296
Officer 388.0 553.467000
Pilot 630.0 495.528750
Polic 492.0 418.494400
Politician 420.0 561.991739
Receptionist 560.0 561.750294
Scientist 432.0 475.320476
Security Guard 510.0 535.684815
Student 558.0 522.103077
Tailor 676.0 463.127059
Teacher 520.0 473.012500
Unemployed 678.0 433.800000
Unkown 400.0 579.054091
Waiter 478.0 572.398400
Waitress 602.0 475.518889
Total Sales: 992590.2
Average Spending: 496.2951
job
Lawyer 74
Barber 70
Tailor 68
Receptionist 68
Accountant 68
Mechanic 66
Chef 64
Butcher 64
Doctor 64
Baker 62
Dentist 60
Janitor 60
Unemployed 56
Hairdresser 54
Nurse 54
Athlete 54
Security Guard 54
Waitress 54
Cashier 52
Student 52
Firefighter 52
Housewife 52
Engineer 52
Polic 50
Waiter 50
Architect 48
Teacher 48
Pilot 48
Clerk 48
Artist 48
Farmer 46
Politician 46
Unkown 44
Scientist 42
Officer 40
Carpenter 40
Librarian 28
Name: count, dtype: int64
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
''' Pract 8 - Interacting with Web APIs
Problem Statement: Analyzing Weather Data from OpenWeatherMap API
Dataset: Weather data retrieved from OpenWeatherMap API
Description: The goal is to interact with the OpenWeatherMap API to retrieve weather data
for a specific location and perform data modeling and visualization to analyze weather
patterns over time.
Tasks to Perform:
1. Register and obtain API key from OpenWeatherMap.
2. Interact with the OpenWeatherMap API using the API key to retrieve weather data for
a specific location.
3. Extract relevant weather attributes such as temperature, humidity, wind speed, and
precipitation from the API response.
4. Clean and preprocess the retrieved data, handling missing values or inconsistent
formats.
5. Perform data modeling to analyze weather patterns, such as calculating average
temperature, maximum/minimum values, or trends over time.
6. Visualize the weather data using appropriate plots, such as line charts, bar plots, or
scatter plots, to represent temperature changes, precipitation levels, or wind speed
variations.
7. Apply data aggregation techniques to summarize weather statistics by specific time
periods (e.g., daily, monthly, seasonal).
8. Incorporate geographical information, if available, to create maps or geospatial
visualizations representing weather patterns across different locations.
9. Explore and visualize relationships between weather attributes, such as temperature
and humidity, using correlation plots or heatmaps.
In [ ]:
In [10]:
import requests
import pandas as pd
import datetime
#default api_key = a4ddc1eec173ae7ce62ad20c1d4c51f3
# api url = f"http://api.openweathermap.org/data/2.5/weather?q=London,uk&APPID=a4ddc1eec173ae7ce62ad20c1d4c51f3
# Set your OpenWeatherMap API key
#default api key = a16fb0accb4b396705eef57c4fa5f0ca
# api url demo = f"http://api.openweathermap.org/data/2.5/forecast?lat={lat}&lon={lon}&appid={api_key}"
api_key = 'fb365aa6104829b44455572365ff3b4e'
# Set the location for which you want to retrieve weather data
lat = 18.184135
lon = 74.610764
# https://openweathermap.org/api/one-call-3
# how How to use api call
# Construct the API URL
api_url = f"http://api.openweathermap.org/data/2.5/forecast?lat={lat}&lon={lon}&appid={api_key}"
# Send a GET request to the API
response = requests.get(api_url)
weather_data = response.json()
weather_data.keys()
len(weather_data['list'])
weather_data['list'][0]['weather'][0]['description']
#Getting the data from dictionary and taking into one variable
# Extract relevant weather attributes using list comprehension
temperatures = [item['main']['temp'] for item in weather_data['list']]
# It will extract all values (40) and putting into one variable
timestamps = [pd.to_datetime(item['dt'], unit='s') for item in weather_data['list']]
temperature = [item['main']['temp'] for item in weather_data['list']]
humidity = [item['main']['humidity'] for item in weather_data['list']]
wind_speed = [item['wind']['speed'] for item in weather_data['list']]
weather_description = [item['weather'][0]['description'] for item in weather_data['list']]
# Create a pandas DataFrame with the extracted weather data
weather_df = pd.DataFrame({'Timestamp': timestamps,
'Temperature': temperatures,
'humidity': humidity,
'wind_speed':wind_speed,
'weather_description': weather_description})
# Set the Timestamp column as the DataFrame's index
weather_df.set_index('Timestamp', inplace=True)
max_temp = weather_df['Temperature'].max()
print(f"Maximum Temperature - {max_temp}")
min_temp = weather_df['Temperature'].min()
print(f"Minimum Temperature - {min_temp}")
# Clean and preprocess the data # Handling missing values
weather_df.fillna(0, inplace=True) # Replace missing values with 0 or appropriate value
# Handling inconsistent format (if applicable)
weather_df['Temperature'] = weather_df['Temperature'].apply(lambda x: x - 273.15 if isinstance(x, float)else x)
# Convert temperature from Kelvin to Celsius
# Print the cleaned and preprocessed data print(weather_df)
weather_df.head()
import matplotlib.pyplot as plt
daily_mean_temp = weather_df['Temperature'].resample('D').mean()
daily_mean_humidity = weather_df['humidity'].resample('D').mean()
daily_mean_wind_speed = weather_df['wind_speed'].resample('D').mean()
# Plot the mean daily temperature over time (Line plot)
plt.figure(figsize=(10, 6))
daily_mean_temp.plot(color='red', linestyle='-', marker='o')
plt.title('Mean Daily Temperature')
plt.xlabel('Date')
plt.ylabel('Temperature (°C)')
plt.grid(True)
plt.show()
# Plot the mean daily humidity over time (Bar plot)
plt.figure(figsize=(10, 6))
daily_mean_humidity.plot(kind='bar', color='blue')
plt.title('Mean Daily Humidity')
plt.xlabel('Date')
plt.ylabel('Humidity (%)')
plt.grid(True)
plt.show()
# Plot the relationship between temperature and wind speed (Scatter plot)
plt.figure(figsize=(10, 6))
plt.scatter(weather_df['Temperature'], weather_df['wind_speed'], color='green')
plt.title('Temperature vs. Wind Speed')
plt.xlabel('Temperature (°C)')
plt.ylabel('Wind Speed (m/s)')
plt.grid(True)
plt.show()
# Heatmap
import seaborn as sns
heatmap_data = weather_df[['Temperature', 'humidity']]
sns.heatmap(heatmap_data, annot=True, cmap='coolwarm')
plt.title('Temperature vs Humidity Heatmap')
plt.show()
# Create a scatter plot to visualize the relationship between temperature and humidity
plt.scatter(weather_df['Temperature'], weather_df['humidity'])
plt.xlabel('Temperature (°C)')
plt.ylabel('Humidity (%)')
plt.title('Temperature vs Humidity Scatter Plot')
plt.show()
Maximum Temperature - 305.17 Minimum Temperature - 293.16
In [ ]:
In [ ]:
#OR
In [ ]:
pip install folium
In [7]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import folium
import seaborn as sns
# Step 1: Register and obtain API key from OpenWeatherMap
api_key = "fb365aa6104829b44455572365ff3b4e"
# Step 2: Interact with the OpenWeatherMap API
base_url = "http://api.openweathermap.org/data/2.5/forecast"
city_name = "New York"
params = {
"q": city_name,
"appid": api_key,
"units": "metric"
}
response = requests.get(base_url, params=params)
data = response.json()
response
data
# Extract relevant weather attributes and timestamps
weather_data = data["list"]
timestamps = [entry["dt"] for entry in weather_data]
temperatures = [entry["main"]["temp"] for entry in weather_data]
humidity = [entry["main"]["humidity"] for entry in weather_data]
wind_speed = [entry["wind"]["speed"] for entry in weather_data]
weather_description = [item['weather'][0]['description'] for item in weather_data]
weather_data
# Convert timestamps to pandas datetime objects
datetime_objects = [pd.to_datetime(timestamp, unit="s") for timestamp in timestamps]
# Create a pandas DataFrame with the extracted weather data
weather_df = pd.DataFrame({
'Timestamp': timestamps,
'Datetime': datetime_objects,
'Temperature': temperatures,
'Humidity': humidity,
'Wind Speed': wind_speed,
'Weather Description': weather_description,
})
# Convert the "Datetime" column to a DatetimeIndex
weather_df.set_index("Datetime", inplace=True)
weather_df
# Calculate total precipitation (rain + snow) for each entry
precipitation = []
for entry in weather_data:
rain = entry.get("rain", {}).get("3h", 0)
snow = entry.get("snow", {}).get("3h", 0)
total_precipitation = rain + snow
precipitation.append(total_precipitation)
precipitation
# Step 5: Perform data modeling
avg_temp = sum(temperatures) / len(temperatures)
max_temp = max(temperatures)
min_temp = min(temperatures)
# Plot the mean daily temperature over time (Line plot)
daily_mean_temp = weather_df['Temperature'].resample('D').mean()
daily_mean_humidity = weather_df['Humidity'].resample('D').mean()
daily_mean_wind_speed = weather_df['Wind Speed'].resample('D').mean()
plt.figure(figsize=(10, 6))
daily_mean_temp.plot(color='red', linestyle='-', marker='o')
plt.title('Mean Daily Temperature')
plt.xlabel('Date')
plt.ylabel('Temperature (°C)')
plt.grid(True)
plt.show()
# Plot the mean daily humidity over time (Bar plot)
plt.figure(figsize=(10, 6))
daily_mean_humidity.plot(kind='bar', color='blue')
plt.title('Mean Daily Humidity')
plt.xlabel('Date')
plt.ylabel('Humidity (%)')
plt.grid(True)
plt.show()
# Plot the relationship between temperature and wind speed (Scatter plot)
plt.figure(figsize=(10, 6))
plt.scatter(weather_df['Temperature'], weather_df['Wind Speed'], color='green')
plt.title('Temperature vs. Wind Speed')
plt.xlabel('Temperature (°C)')
plt.ylabel('Wind Speed (m/s)')
plt.grid(True)
plt.show()
# Step 6: Visualize the weather data
plt.figure(figsize=(10, 6))
plt.plot(timestamps, temperatures, label="Temperature (°C)")
plt.plot(timestamps, precipitation, label="Precipitation")
plt.xlabel("Timestamp")
plt.ylabel("Value")
plt.title(f"Weather Data for {city_name}")
plt.legend()
plt.show()
# Create a heatmap of temperature vs humidity
heatmap_data = weather_df[['Temperature', 'Humidity']]
sns.heatmap(heatmap_data, annot=True, cmap='coolwarm')
plt.title('Temperature vs Humidity Heatmap')
plt.show()
# Scatter plot to visualize the relationship between temperature and humidity
plt.scatter(weather_df['Temperature'], weather_df['Humidity'])
plt.xlabel('Temperature (°C)')
plt.ylabel('Humidity (%)')
plt.title('Temperature vs Humidity Scatter Plot')
plt.show()
# Step 7: Retrieve weather data for multiple locations
locations = ['London', 'Paris', 'New York']
multi_location_df = pd.DataFrame()
for location in locations:
api_url = f'http://api.openweathermap.org/data/2.5/weather?q={location}&appid={api_key}'
response = requests.get(api_url)
data = response.json()
# Extract relevant weather attributes
temperature = data['main']['temp']
humidity = data['main']['humidity']
wind_speed = data['wind']['speed']
latitude = data.get('coord', {}).get('lat', None)
longitude = data.get('coord', {}).get('lon', None)
# Create a DataFrame for the current location's weather data
location_df = pd.DataFrame({
'Location': [location],
'Temperature': [temperature],
'Humidity': [humidity],
'Wind Speed': [wind_speed],
'Latitude': [latitude],
'Longitude': [longitude]
})
# Append the current location's weather data to the main DataFrame
multi_location_df = pd.concat([multi_location_df, location_df], ignore_index=True)
# Display the DataFrame with weather data for multiple locations
print(multi_location_df)
# Step 8: Geospatial Visualization with Folium
# Center the map around New York (or the first location)
latitude = multi_location_df.iloc[0]['Latitude']
longitude = multi_location_df.iloc[0]['Longitude']
weather_map = folium.Map(location=[latitude, longitude], zoom_start=4)
# Add markers for each location in the DataFrame
for index, row in multi_location_df.iterrows():
location_name = row['Location']
latitude = row['Latitude']
longitude = row['Longitude']
folium.Marker([latitude, longitude], popup=location_name).add_to(weather_map)
# Save the map as an HTML file
weather_map.save("weather_map.html")
print("Map saved as 'weather_map.html'.")
Location Temperature Humidity Wind Speed Latitude Longitude 0 London 284.68 83 4.12 51.5085 -0.1257 1 Paris 282.20 90 4.12 48.8534 2.3488 2 New York 285.31 34 4.12 40.7143 -74.0060 Map saved as 'weather_map.html'.
In [ ]:
In [ ]:
#OR
In [22]:
#Own code
import requests
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
# Set your OpenWeatherMap API key
api_key = 'a4ddc1eec173ae7ce62ad20c1d4c51f3' # Replace with your actual API key
lat = 18.184135 # Latitude of the location
lon = 74.610764 # Longitude of the location
# API URL for forecast data
api_url = f"http://api.openweathermap.org/data/2.5/forecast?lat={lat}&lon={lon}&appid={api_key}"
# Send a GET request to the API
response = requests.get(api_url)
weather_data = response.json()
# Ensure 'list' exists in the response
if 'list' in weather_data:
# Extract relevant weather attributes using list comprehension
temperatures = [item['main']['temp'] - 273.15 for item in weather_data['list']] # Convert from Kelvin to Celsius
timestamps = [pd.to_datetime(item['dt'], unit='s') for item in weather_data['list']]
humidity = [item['main']['humidity'] for item in weather_data['list']]
wind_speed = [item['wind']['speed'] for item in weather_data['list']]
weather_description = [item['weather'][0]['description'] for item in weather_data['list']]
# Create a pandas DataFrame
weather_df = pd.DataFrame({
'Timestamp': timestamps,
'Temperature': temperatures,
'Humidity': humidity,
'Wind Speed': wind_speed,
'Weather Description': weather_description
})
# Set the Timestamp column as the DataFrame's index
weather_df.set_index('Timestamp', inplace=True)
# Print the cleaned DataFrame
print(weather_df.head())
# Calculate max and min temperatures
max_temp = weather_df['Temperature'].max()
print(f"Maximum Temperature: {max_temp:.2f}°C")
min_temp = weather_df['Temperature'].min()
print(f"Minimum Temperature: {min_temp:.2f}°C")
# Clean and preprocess the data
weather_df.fillna(0, inplace=True) # Replace missing values with 0 or appropriate value
# Plot the mean daily temperature over time (Line plot)
daily_mean_temp = weather_df['Temperature'].resample('D').mean()
plt.figure(figsize=(10, 6))
daily_mean_temp.plot(color='red', linestyle='-', marker='o')
plt.title('Mean Daily Temperature')
plt.xlabel('Date')
plt.ylabel('Temperature (°C)')
plt.grid(True)
plt.show()
# Plot the mean daily humidity over time (Bar plot)
daily_mean_humidity = weather_df['Humidity'].resample('D').mean()
plt.figure(figsize=(10, 6))
daily_mean_humidity.plot(kind='bar', color='blue')
plt.title('Mean Daily Humidity')
plt.xlabel('Date')
plt.ylabel('Humidity (%)')
plt.grid(True)
plt.show()
# Plot the relationship between temperature and wind speed (Scatter plot)
plt.figure(figsize=(10, 6))
plt.scatter(weather_df['Temperature'], weather_df['Wind Speed'], color='green')
plt.title('Temperature vs. Wind Speed')
plt.xlabel('Temperature (°C)')
plt.ylabel('Wind Speed (m/s)')
plt.grid(True)
plt.show()
# Heatmap of Temperature vs Humidity
heatmap_data = weather_df[['Temperature', 'Humidity']]
plt.figure(figsize=(10, 6))
sns.heatmap(heatmap_data, annot=True, cmap='coolwarm')
plt.title('Temperature vs Humidity Heatmap')
plt.show()
# Scatter plot to visualize the relationship between temperature and humidity
plt.figure(figsize=(10, 6))
plt.scatter(weather_df['Temperature'], weather_df['Humidity'])
plt.xlabel('Temperature (°C)')
plt.ylabel('Humidity (%)')
plt.title('Temperature vs Humidity Scatter Plot')
plt.grid(True)
plt.show()
else:
print("No weather data available.")
Temperature Humidity Wind Speed Weather Description Timestamp 2024-11-04 18:00:00 24.47 49 3.18 broken clouds 2024-11-04 21:00:00 23.93 51 2.52 scattered clouds 2024-11-05 00:00:00 22.40 55 1.89 scattered clouds 2024-11-05 03:00:00 24.66 50 2.54 clear sky 2024-11-05 06:00:00 29.46 37 3.93 clear sky Maximum Temperature: 31.40°C Minimum Temperature: 20.29°C
In [ ]:
In [ ]:
In [ ]:
In [ ]:
'''Pract 9 - 9 Data Cleaning and Preparation
Problem Statement: Analyzing Customer Churn in a Telecommunications Company
Dataset: "Telecom_Customer_Churn.csv"
Description: The dataset contains information about customers of a telecommunications
company and whether they have churned (i.e., discontinued their services). The dataset
includes various attributes of the customers, such as their demographics, usage patterns, and
account information. The goal is to perform data cleaning and preparation to gain insights
into the factors that contribute to customer churn.
Tasks to Perform:
1. Import the "Telecom_Customer_Churn.csv" dataset.
2. Explore the dataset to understand its structure and content.
3. Handle missing values in the dataset, deciding on an appropriate strategy.
4. Remove any duplicate records from the dataset.
5. Check for inconsistent data, such as inconsistent formatting or spelling variations,
and standardize it.
6. Convert columns to the correct data types as needed.
7. Identify and handle outliers in the data.
8. Perform feature engineering, creating new features that may be relevant to
predicting customer churn.
9. Normalize or scale the data if necessary.
Faculty of Science and Technology Savitribai Phule Pune University
Syllabus for Fourth Year of Artificial Intelligence and Data Science (2020 Course) 47/126
10. Split the dataset into training and testing sets for further analysis.
11. Export the cleaned dataset for future analysis or modeling.
In [ ]:
In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt
data = pd.read_csv("Telco-Customer-Churn.csv")
print(data.index)
data.head()
print(data.columns)
data.shape
data.nunique()
data.isna().sum()
data.isnull().sum()
# Check the number of rows before removing duplicates
print("Number of rows before removing duplicates:", len(data))
# Remove duplicate records
data_cleaned = data.drop_duplicates()
# Remove duplicate records
data_cleaned = data.drop_duplicates()
data.describe()
# Measure of frequency destribution
unique, counts = np.unique(data['tenure'], return_counts=True)
print(unique, counts)
# Measure of frequency destribution
unique, counts = np.unique(data['MonthlyCharges'], return_counts=True)
print(unique, counts)
# Measure of frequency destribution
unique, counts = np.unique(data['TotalCharges'], return_counts=True)
print(unique, counts)
sns.pairplot(data)
plt.boxplot(data['tenure'])
plt.show()
plt.boxplot(data['MonthlyCharges'])
plt.show()
X = data.drop("Churn", axis=1)
y = data["Churn"]
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape
y_train.shape
X_test.shape
y_test.shape
# Export the cleaned dataset to a CSV file
data.to_csv("Cleaned_Telecom_Customer_Churn.csv", index=False)
RangeIndex(start=0, stop=7043, step=1)
Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
dtype='object')
Number of rows before removing duplicates: 7043
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
72] [ 11 613 238 200 176 133 110 131 123 119 116 99 117 109 76 99 80 87
97 73 71 63 90 85 94 79 79 72 57 72 72 65 69 64 65 88
50 65 59 56 64 70 65 65 51 61 74 68 64 66 68 68 80 70
68 64 80 65 67 60 76 76 70 72 80 76 89 98 100 95 119 170
362]
[ 18.25 18.4 18.55 ... 118.6 118.65 118.75] [1 1 1 ... 2 1 1]
[' ' '100.2' '100.25' ... '999.45' '999.8' '999.9'] [11 1 1 ... 1 1 1]
In [ ]:
In [ ]:
#OR
In [15]:
import pandas as pd #data manipulation
import numpy as np #numerical computations
from sklearn.model_selection import train_test_split # scikit-learn fo
from sklearn import metrics #evaluating the performance of
data = pd.read_csv("Telco-Customer-Churn.csv")
print(data.index)
print(data)
print(data.columns)
data.shape
print(data.head())
print(data.tail())
data.nunique()
# data.isna().sum() is used to count the number of missing values (NaN values) in e
data.isna().sum()
# isna() and isnull() are essentially the same method in Pandas, and they both retu
data.isnull().sum()
# Check the number of rows before removing duplicates
print("Number of rows before removing duplicates:", len(data))
# Remove duplicate records
data_cleaned = data.drop_duplicates()
# Check the number of rows after removing duplicates
print("Number of rows after removing duplicates:", len(data_cleaned))
data.describe()
#Measure of frequency destribution
unique, counts = np.unique(data['tenure'], return_counts=True)
print(unique, counts)
#Measure of frequency destribution
unique, counts = np.unique(data['MonthlyCharges'], return_counts=True)
print(unique, counts)
#Measure of frequency destribution
unique, counts = np.unique(data['TotalCharges'], return_counts=True)
print(unique, counts)
# sns.pairplot(data) creates a grid of pairwise plots of the variables in a dataset
import seaborn as sns #Seaborn library for data visualization
sns.pairplot(data)
#checking boxplot for Fare column
import matplotlib.pyplot as plt #pyplot module from the Matplotlib lib
plt.boxplot(data['tenure'])
plt.show()
plt.boxplot(data['MonthlyCharges'])
plt.show()
X = data.drop("Churn", axis=1)
y = data["Churn"]
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#Export the cleaned data
X_train.shape
y_train.shape
X_test.shape
y_test.shape
# Export the cleaned dataset to a CSV file
data.to_csv("Cleaned_Telecom_Customer_Churn.csv", index=False)
RangeIndex(start=0, stop=7043, step=1)
customerID gender SeniorCitizen Partner Dependents tenure \
0 7590-VHVEG Female 0 Yes No 1
1 5575-GNVDE Male 0 No No 34
2 3668-QPYBK Male 0 No No 2
3 7795-CFOCW Male 0 No No 45
4 9237-HQITU Female 0 No No 2
... ... ... ... ... ... ...
7038 6840-RESVB Male 0 Yes Yes 24
7039 2234-XADUH Female 0 Yes Yes 72
7040 4801-JZAZL Female 0 Yes Yes 11
7041 8361-LTMKD Male 1 Yes No 4
7042 3186-AJIEK Male 0 No No 66
PhoneService MultipleLines InternetService OnlineSecurity ... \
0 No No phone service DSL No ...
1 Yes No DSL Yes ...
2 Yes No DSL Yes ...
3 No No phone service DSL Yes ...
4 Yes No Fiber optic No ...
... ... ... ... ... ...
7038 Yes Yes DSL Yes ...
7039 Yes Yes Fiber optic No ...
7040 No No phone service DSL Yes ...
7041 Yes Yes Fiber optic No ...
7042 Yes No Fiber optic Yes ...
DeviceProtection TechSupport StreamingTV StreamingMovies Contract \
0 No No No No Month-to-month
1 Yes No No No One year
2 No No No No Month-to-month
3 Yes Yes No No One year
4 No No No No Month-to-month
... ... ... ... ... ...
7038 Yes Yes Yes Yes One year
7039 Yes No Yes Yes One year
7040 No No No No Month-to-month
7041 No No No No Month-to-month
7042 Yes Yes Yes Yes Two year
PaperlessBilling PaymentMethod MonthlyCharges TotalCharges \
0 Yes Electronic check 29.85 29.85
1 No Mailed check 56.95 1889.5
2 Yes Mailed check 53.85 108.15
3 No Bank transfer (automatic) 42.30 1840.75
4 Yes Electronic check 70.70 151.65
... ... ... ... ...
7038 Yes Mailed check 84.80 1990.5
7039 Yes Credit card (automatic) 103.20 7362.9
7040 Yes Electronic check 29.60 346.45
7041 Yes Mailed check 74.40 306.6
7042 Yes Bank transfer (automatic) 105.65 6844.5
Churn
0 No
1 No
2 Yes
3 No
4 Yes
... ...
7038 No
7039 No
7040 No
7041 Yes
7042 No
[7043 rows x 21 columns]
Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
dtype='object')
customerID gender SeniorCitizen Partner Dependents tenure PhoneService \
0 7590-VHVEG Female 0 Yes No 1 No
1 5575-GNVDE Male 0 No No 34 Yes
2 3668-QPYBK Male 0 No No 2 Yes
3 7795-CFOCW Male 0 No No 45 No
4 9237-HQITU Female 0 No No 2 Yes
MultipleLines InternetService OnlineSecurity ... DeviceProtection \
0 No phone service DSL No ... No
1 No DSL Yes ... Yes
2 No DSL Yes ... No
3 No phone service DSL Yes ... Yes
4 No Fiber optic No ... No
TechSupport StreamingTV StreamingMovies Contract PaperlessBilling \
0 No No No Month-to-month Yes
1 No No No One year No
2 No No No Month-to-month Yes
3 Yes No No One year No
4 No No No Month-to-month Yes
PaymentMethod MonthlyCharges TotalCharges Churn
0 Electronic check 29.85 29.85 No
1 Mailed check 56.95 1889.5 No
2 Mailed check 53.85 108.15 Yes
3 Bank transfer (automatic) 42.30 1840.75 No
4 Electronic check 70.70 151.65 Yes
[5 rows x 21 columns]
customerID gender SeniorCitizen Partner Dependents tenure \
7038 6840-RESVB Male 0 Yes Yes 24
7039 2234-XADUH Female 0 Yes Yes 72
7040 4801-JZAZL Female 0 Yes Yes 11
7041 8361-LTMKD Male 1 Yes No 4
7042 3186-AJIEK Male 0 No No 66
PhoneService MultipleLines InternetService OnlineSecurity ... \
7038 Yes Yes DSL Yes ...
7039 Yes Yes Fiber optic No ...
7040 No No phone service DSL Yes ...
7041 Yes Yes Fiber optic No ...
7042 Yes No Fiber optic Yes ...
DeviceProtection TechSupport StreamingTV StreamingMovies Contract \
7038 Yes Yes Yes Yes One year
7039 Yes No Yes Yes One year
7040 No No No No Month-to-month
7041 No No No No Month-to-month
7042 Yes Yes Yes Yes Two year
PaperlessBilling PaymentMethod MonthlyCharges TotalCharges \
7038 Yes Mailed check 84.80 1990.5
7039 Yes Credit card (automatic) 103.20 7362.9
7040 Yes Electronic check 29.60 346.45
7041 Yes Mailed check 74.40 306.6
7042 Yes Bank transfer (automatic) 105.65 6844.5
Churn
7038 No
7039 No
7040 No
7041 Yes
7042 No
[5 rows x 21 columns]
Number of rows before removing duplicates: 7043
Number of rows after removing duplicates: 7043
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
72] [ 11 613 238 200 176 133 110 131 123 119 116 99 117 109 76 99 80 87
97 73 71 63 90 85 94 79 79 72 57 72 72 65 69 64 65 88
50 65 59 56 64 70 65 65 51 61 74 68 64 66 68 68 80 70
68 64 80 65 67 60 76 76 70 72 80 76 89 98 100 95 119 170
362]
[ 18.25 18.4 18.55 ... 118.6 118.65 118.75] [1 1 1 ... 2 1 1]
[' ' '100.2' '100.25' ... '999.45' '999.8' '999.9'] [11 1 1 ... 1 1 1]
In [ ]:
In [ ]:
'''Pract 10 - 10 Data Wrangling
Problem Statement: Data Wrangling on Real Estate Market
Dataset: "RealEstate_Prices.csv"
Description: The dataset contains information about housing prices in a specific real estate
market. It includes various attributes such as property characteristics, location, sale prices,
and other relevant features. The goal is to perform data wrangling to gain insights into the
factors influencing housing prices and prepare the dataset for further analysis or modeling.
Tasks to Perform:
1. Import the "RealEstate_Prices.csv" dataset. Clean column names by removing spaces,
special characters, or renaming them for clarity.
2. Handle missing values in the dataset, deciding on an appropriate strategy (e.g.,
imputation or removal).
3. Perform data merging if additional datasets with relevant information are available
(e.g., neighborhood demographics or nearby amenities).
4. Filter and subset the data based on specific criteria, such as a particular time period,
property type, or location.
5. Handle categorical variables by encoding them appropriately (e.g., one-hot encoding or
label encoding) for further analysis.
6. Aggregate the data to calculate summary statistics or derived metrics such as average
sale prices by neighborhood or property type.
7. Identify and handle outliers or extreme values in the data that may affect the analysis
or modeling process
In [4]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import warnings
# Supressing update warnings
warnings.filterwarnings('ignore')
df1 = pd.read_csv("Bengaluru_House_Data.csv")
df1.head()
df1.shape
df1.columns
df1['area_type']
df1['area_type'].unique()
df1['area_type'].value_counts()
df2 = df1.drop(['area_type','society','balcony','availability'],axis='columns')
df2.shape
df2.isnull().sum()
df2.shape
df3 = df2.dropna()
df3.isnull().sum()
df3.shape
df3['size'].unique()
df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))
df3.head()
df3.bhk.unique()
df3[df3.bhk>20]
df3.total_sqft.unique()
def is_float(x):
try:
float(x)
return True
except(ValueError, TypeError):
return False
df3[~df3['total_sqft'].apply(is_float)].head(10)
def convert_sqft_to_num(x):
tokens = x.split('-')
if len(tokens) == 2:
try:
return (float(tokens[0])+float(tokens[1]))/2
except ValueError:
return None
try:
return float(x)
except ValueError:
return None
result = convert_sqft_to_num('2100 - 2850')
print(result)
convert_sqft_to_num('34.46Sq. Meter')
df4 = df3.copy()
df4.total_sqft = df4.total_sqft.apply(convert_sqft_to_num)
df4
df4 = df4[df4.total_sqft.notnull()]
df4
df4.loc[30]
df5 = df4.copy()
df5['price_per_sqft'] = df5['price']*100000/df5['total_sqft']
df5.head()
df5_stats = df5['price_per_sqft'].describe()
df5_stats
df5.to_csv("bhp.csv",index=False)
df5.location = df5.location.apply(lambda x: x.strip())
location_stats = df5['location'].value_counts(ascending=False)
location_stats
len(location_stats[location_stats>10])
len(location_stats)
len(location_stats[location_stats<=10])
location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10
len(df5.location.unique())
df5.location = df5.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x)
len(df5.location.unique())
df5.head(10)
df5[df5.total_sqft/df5.bhk<300].head()
df5.shape
df6 = df5[~(df5.total_sqft/df5.bhk<300)]
df6.shape
df6.columns
plt.boxplot(df6['total_sqft'])
plt.show()
Q1 = np.percentile(df6['total_sqft'], 25.) # 25th percentile of the data of the given feature
Q3 = np.percentile(df6['total_sqft'], 75.) # 75th percentile of the data of the given feature
IQR = Q3-Q1 #Interquartile Range
ll = Q1 - (1.5*IQR)
ul = Q3 + (1.5*IQR)
upper_outliers = df6[df6['total_sqft'] > ul].index.tolist()
lower_outliers = df6[df6['total_sqft'] < ll].index.tolist()
bad_indices = list(set(upper_outliers + lower_outliers))
drop = True
if drop:
df6.drop(bad_indices, inplace = True, errors = 'ignore')
plt.boxplot(df6['bath'])
plt.show()
Q1 = np.percentile(df6['bath'], 25.) # 25th percentile of the data of the given feature
Q3 = np.percentile(df6['bath'], 75.) # 75th percentile of the data of the given feature
IQR = Q3-Q1 #Interquartile Range
ll = Q1 - (1.5*IQR)
ul = Q3 + (1.5*IQR)
upper_outliers = df6[df6['bath'] > ul].index.tolist()
lower_outliers = df6[df6['bath'] < ll].index.tolist()
bad_indices = list(set(upper_outliers + lower_outliers))
drop = True
if drop:
df6.drop(bad_indices, inplace = True, errors = 'ignore')
plt.boxplot(df6['price'])
plt.show()
Q1 = np.percentile(df6['price'], 25.) # 25th percentile of the data of the given feature
Q3 = np.percentile(df6['price'], 75.) # 75th percentile of the data of the given feature
IQR = Q3-Q1 #Interquartile Range
ll = Q1 - (1.5*IQR)
ul = Q3 + (1.5*IQR)
upper_outliers = df6[df6['price'] > ul].index.tolist()
lower_outliers = df6[df6['price'] < ll].index.tolist()
bad_indices = list(set(upper_outliers + lower_outliers))
drop = True
if drop:
df6.drop(bad_indices, inplace = True, errors = 'ignore')
plt.boxplot(df6['bhk'])
plt.show()
Q1 = np.percentile(df6['bhk'], 25.) # 25th percentile of the data of the given feature
Q3 = np.percentile(df6['bhk'], 75.) # 75th percentile of the data of the given feature
IQR = Q3-Q1 #Interquartile Range
ll = Q1 - (1.5*IQR)
ul = Q3 + (1.5*IQR)
upper_outliers = df6[df6['bhk'] > ul].index.tolist()
lower_outliers = df6[df6['bhk'] < ll].index.tolist()
bad_indices = list(set(upper_outliers + lower_outliers))
drop = True
if drop:
df6.drop(bad_indices, inplace = True, errors = 'ignore')
plt.boxplot(df6['price_per_sqft'])
plt.show()
Q1 = np.percentile(df6['price_per_sqft'], 25.) # 25th percentile of the data of the given feature
Q3 = np.percentile(df6['price_per_sqft'], 75.) # 75th percentile of the data of the given feature
IQR = Q3-Q1 #Interquartile Range
ll = Q1 - (1.5*IQR)
ul = Q3 + (1.5*IQR)
upper_outliers = df6[df6['price_per_sqft'] > ul].index.tolist()
lower_outliers = df6[df6['price_per_sqft'] < ll].index.tolist()
bad_indices = list(set(upper_outliers + lower_outliers))
drop = True
if drop:
df6.drop(bad_indices, inplace = True, errors = 'ignore')
plt.boxplot(df6['price_per_sqft'])
plt.show()
df6.shape
X = df6.drop(['price'],axis='columns')
X.head(3)
X.shape
y = df6.price
y.head(3)
len(y)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)
X_train.shape
y_train.shape
X_test.shape
y_test.shape
2475.0
Out[4]:
(2018,)
In [ ]:
In [ ]:
#OR
In [7]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)
# Load and inspect data
df1 = pd.read_csv("Bengaluru_House_Data.csv")
df1.head()
df1.shape
df1.columns
df1['area_type']
df1['area_type'].unique()
df1['area_type'].value_counts()
# Drop unnecessary columns
df2 = df1.drop(['area_type','society','balcony','availability'], axis='columns')
df2.shape
# Check for null values
df2.isnull().sum()
df2.shape
# Drop rows with null values
df3 = df2.dropna()
df3.isnull().sum()
df3.shape
# Feature engineering
df3['size'].unique()
df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))
df3.head()
df3.bhk.unique()
df3[df3.bhk > 20]
df3.total_sqft.unique()
# Explore total_sqft feature
def is_float(x):
try:
float(x)
return True
except:
return False
df3[~df3['total_sqft'].apply(is_float)].head(10)
def convert_sqft_to_num(x):
tokens = x.split('-')
if len(tokens) == 2:
return (float(tokens[0]) + float(tokens[1])) / 2
try:
return float(x)
except:
return None
convert_sqft_to_num('2100 - 2850')
convert_sqft_to_num('34.46Sq. Meter')
# Apply conversion function
df4 = df3.copy()
df4['total_sqft'] = df4['total_sqft'].apply(convert_sqft_to_num)
df4 = df4[df4.total_sqft.notnull()]
df4.loc[30]
# Adding price per square foot
df5 = df4.copy()
df5['price_per_sqft'] = df5['price'] * 100000 / df5['total_sqft']
df5.head()
# Statistics of price per square foot
df5_stats = df5['price_per_sqft'].describe()
df5_stats
df5.to_csv("bhp.csv", index=False)
# Unique locations count
len(df5.location.unique())
df5.location = df5.location.apply(lambda x: x.strip())
location_stats = df5['location'].value_counts(ascending=False)
location_stats
len(location_stats[location_stats > 10])
len(location_stats)
len(location_stats[location_stats <= 10])
# Dimensionality reduction
location_stats_less_than_10 = location_stats[location_stats <= 10]
location_stats_less_than_10
len(df5.location.unique())
df5.location = df5.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x)
len(df5.location.unique())
# Outlier Removal Using Business Logic
df5[df5.total_sqft / df5.bhk < 300].head()
df5.shape
df6 = df5[~(df5.total_sqft / df5.bhk < 300)]
df6.shape
df6.columns
# Boxplot for total_sqft
plt.boxplot(df6['total_sqft'])
plt.show()
# IQR method for outlier detection and removal
Q1 = np.percentile(df6['total_sqft'], 25)
Q3 = np.percentile(df6['total_sqft'], 75)
IQR = Q3 - Q1
ll = Q1 - (1.5 * IQR)
ul = Q3 + (1.5 * IQR)
upper_outliers = df6[df6['total_sqft'] > ul].index.tolist()
lower_outliers = df6[df6['total_sqft'] < ll].index.tolist()
bad_indices = list(set(upper_outliers + lower_outliers))
drop = True
if drop:
df6.drop(bad_indices, inplace=True, errors='ignore')
# Boxplot for bath
plt.boxplot(df6['bath'])
plt.show()
Q1 = np.percentile(df6['bath'], 25)
Q3 = np.percentile(df6['bath'], 75)
IQR = Q3 - Q1
ll = Q1 - (1.5 * IQR)
ul = Q3 + (1.5 * IQR)
upper_outliers = df6[df6['bath'] > ul].index.tolist()
lower_outliers = df6[df6['bath'] < ll].index.tolist()
bad_indices = list(set(upper_outliers + lower_outliers))
drop = True
if drop:
df6.drop(bad_indices, inplace=True, errors='ignore')
# Boxplot for price
plt.boxplot(df6['price'])
plt.show()
Q1 = np.percentile(df6['price'], 25)
Q3 = np.percentile(df6['price'], 75)
IQR = Q3 - Q1
ll = Q1 - (1.5 * IQR)
ul = Q3 + (1.5 * IQR)
upper_outliers = df6[df6['price'] > ul].index.tolist()
lower_outliers = df6[df6['price'] < ll].index.tolist()
bad_indices = list(set(upper_outliers + lower_outliers))
drop = True
if drop:
df6.drop(bad_indices, inplace=True, errors='ignore')
# Boxplot for bhk
plt.boxplot(df6['bhk'])
plt.show()
Q1 = np.percentile(df6['bhk'], 25)
Q3 = np.percentile(df6['bhk'], 75)
IQR = Q3 - Q1
ll = Q1 - (1.5 * IQR)
ul = Q3 + (1.5 * IQR)
upper_outliers = df6[df6['bhk'] > ul].index.tolist()
lower_outliers = df6[df6['bhk'] < ll].index.tolist()
bad_indices = list(set(upper_outliers + lower_outliers))
drop = True
if drop:
df6.drop(bad_indices, inplace=True, errors='ignore')
# Boxplot for price_per_sqft
plt.boxplot(df6['price_per_sqft'])
plt.show()
Q1 = np.percentile(df6['price_per_sqft'], 25)
Q3 = np.percentile(df6['price_per_sqft'], 75)
IQR = Q3 - Q1
ll = Q1 - (1.5 * IQR)
ul = Q3 + (1.5 * IQR)
upper_outliers = df6[df6['price_per_sqft'] > ul].index.tolist()
lower_outliers = df6[df6['price_per_sqft'] < ll].index.tolist()
bad_indices = list(set(upper_outliers + lower_outliers))
drop = True
if drop:
df6.drop(bad_indices, inplace=True, errors='ignore')
df6.shape
# Splitting data into features and target variable
X = df6.drop(['price'], axis='columns')
X.head(3)
X.shape
y = df6.price
y.head(3)
len(y)
# Splitting dataset into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape
y_train.shape
X_test.shape
y_test.shape
Out[7]:
(2018,)
In [ ]:
In [ ]:
'''pract11 - Data Visualization using matplotlib
Problem Statement: Analyzing Air Quality Index (AQI) Trends in a City
Dataset: "City_Air_Quality.csv"
Description: The dataset contains information about air quality measurements in a specific
city over a period of time. It includes attributes such as date, time, pollutant levels (e.g., PM2.5,
PM10, CO), and the Air Quality Index (AQI) values. The goal is to use the matplotlib library
to create visualizations that effectively represent the AQI trends and patterns for different
pollutants in the city.
Tasks to Perform:
1. Import the "City_Air_Quality.csv" dataset.
2. Explore the dataset to understand its structure and content.
3. Identify the relevant variables for visualizing AQI trends, such as date, pollutant levels,
and AQI values.
4. Create line plots or time series plots to visualize the overall AQI trend over time.
5. Plot individual pollutant levels (e.g., PM2.5, PM10, CO) on separate line plots to
visualize their trends over time.
6. Use bar plots or stacked bar plots to compare the AQI values across different dates or
time periods.
7. Create box plots or violin plots to analyze the distribution of AQI values for different
pollutant categories.
8. Use scatter plots or bubble charts to explore the relationship between AQI values and
pollutant levels.
9. Customize the visualizations by adding labels, titles, legends, and appropriate color
schemes
In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
import warnings
# Suppressing warnings
warnings.filterwarnings('ignore')
# Load the data
data = pd.read_csv("India Air Quality Data.csv", encoding="cp1252")
data.info()
# Clean up state name changes
data['state'] = data['state'].replace({'Uttaranchal':'Uttarakhand'})
data.loc[data['location'] == "Jamshedpur", 'state'] = 'Jharkhand'
# Uniform format for type column
types = {
"Residential": "R",
"Residential and others": "RO",
"Residential, Rural and other Areas": "RRO",
"Industrial Area": "I",
"Industrial Areas": "I",
"Industrial": "I",
"Sensitive Area": "S",
"Sensitive Areas": "S",
"Sensitive": "S",
np.nan: "RRO"
}
data['type'] = data['type'].replace(types)
# Columns of interest
VALUE_COLS = ['so2', 'no2', 'rspm', 'spm', 'pm2_5']
# Impute missing values with mean for selected columns
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
data[VALUE_COLS] = imputer.fit_transform(data[VALUE_COLS])
# Check for any remaining null values
print(data.isnull().sum())
# Plotting top 10 and bottom 10 states by pollution level
def top_and_bottom_10_states(indicator="so2"):
fig, ax = plt.subplots(2, 1, figsize=(20, 12))
ind = data[[indicator, 'state']].groupby('state', as_index=False).median().sort_values(by=indicator, ascending=False)
# Top 10 states
sns.barplot(x='state', y=indicator, data=ind[:10], ax=ax[0], color='red')
ax[0].set_title(f"Top 10 states by {indicator} (1991-2016)")
ax[0].set_ylabel(f"{indicator} (µg/m3)")
ax[0].set_xlabel("State")
# Bottom 10 states
sns.barplot(x='state', y=indicator, data=ind[-10:], ax=ax[1], color='green')
ax[1].set_title(f"Bottom 10 states by {indicator} (1991-2016)")
ax[1].set_ylabel(f"{indicator} (µg/m3)")
ax[1].set_xlabel("State")
top_and_bottom_10_states("so2")
top_and_bottom_10_states("no2")
# Plotting the highest recorded levels for each state
def highest_levels_recorded(indicator="so2"):
plt.figure(figsize=(20, 10))
# Ensure the indicator column is numeric
data[indicator] = pd.to_numeric(data[indicator], errors='coerce')
# Drop rows with NaN in the indicator column before aggregation
filtered_data = data.dropna(subset=[indicator])
# Group by state and find max for the indicator
ind = filtered_data.groupby('state', as_index=False).agg({indicator: 'max'})
# Plotting
sns.barplot(x='state', y=indicator, data=ind)
plt.title(f"Highest ever {indicator} levels recorded by state")
plt.xticks(rotation=90)
highest_levels_recorded("no2")
highest_levels_recorded("rspm")
# Plot pollutant averages by type
def type_avg(indicator=""):
type_avg_data = data[VALUE_COLS + ['type']].groupby("type").mean()
if indicator:
type_avg_data[indicator].plot(kind='bar')
plt.title(f"Pollutant average by type for {indicator}")
else:
type_avg_data.plot(kind='bar')
plt.title("Pollutant average by type")
plt.xticks(rotation=0)
plt.show()
type_avg('so2')
# Plot pollutant averages by location within a state
def location_avgs(state, indicator="so2"):
locs = data[VALUE_COLS + ['state', 'location']].groupby(['state', 'location']).mean()
state_avgs = locs.loc[state].reset_index()
plt.figure(figsize=(15, 8))
sns.barplot(x='location', y=indicator, data=state_avgs)
plt.title(f"Location-wise average for {indicator} in {state}")
plt.xticks(rotation=90)
plt.show()
location_avgs("Bihar", "no2")
<class 'pandas.core.frame.DataFrame'> RangeIndex: 435742 entries, 0 to 435741 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 stn_code 291665 non-null object 1 sampling_date 435739 non-null object 2 state 435742 non-null object 3 location 435739 non-null object 4 agency 286261 non-null object 5 type 430349 non-null object 6 so2 401096 non-null float64 7 no2 419509 non-null float64 8 rspm 395520 non-null float64 9 spm 198355 non-null float64 10 location_monitoring_station 408251 non-null object 11 pm2_5 9314 non-null float64 12 date 435735 non-null object dtypes: float64(5), object(8) memory usage: 43.2+ MB stn_code 144077 sampling_date 3 state 0 location 3 agency 149481 type 0 so2 0 no2 0 rspm 0 spm 0 location_monitoring_station 27491 pm2_5 0 date 7 dtype: int64
In [ ]:
In [ ]:
#OR
In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
import warnings
%matplotlib inline
# Load the dataset
aqi = pd.read_csv("India Air Quality Data.csv", encoding="ISO-8859-1", parse_dates=['date'])
# Check the first few rows of the dataset
aqi.head()
# Set the style for the plots
sns.set(style="ticks", rc={'figure.figsize': (20, 15)})
# Suppressing update warnings
warnings.filterwarnings('ignore')
# Checking the original dataset for null values and its shape
print(aqi.isnull().sum())
print(aqi.shape)
aqi.info()
# Cleaning up the data
aqi.drop(['stn_code', 'agency', 'sampling_date', 'location_monitoring_station'], axis=1, inplace=True)
# Dropping rows where no date is available
aqi = aqi.dropna(subset=['date'])
# Cleaning up name changes
aqi.state = aqi.state.replace({'Uttaranchal': 'Uttarakhand'})
aqi.loc[aqi.location == "Jamshedpur", 'state'] = 'Jharkhand'
# Changing types to a uniform format
types = {
"Residential": "R",
"Residential and others": "RO",
"Residential, Rural and other Areas": "RRO",
"Industrial Area": "I",
"Industrial Areas": "I",
"Industrial": "I",
"Sensitive Area": "S",
"Sensitive Areas": "S",
"Sensitive": "S",
np.nan: "RRO"
}
aqi['type'] = aqi['type'].replace(types)
# Display the cleaned dataset
aqi.head()
# Defining columns of importance that will be used regularly
VALUE_COLS = ['so2', 'no2', 'rspm', 'spm', 'pm2_5']
# Invoking SimpleImputer to fill missing values
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
aqi[VALUE_COLS] = imputer.fit_transform(aqi[VALUE_COLS])
# Checking to see if the dataset has any null values left
print(aqi.isnull().sum())
aqi.tail()
# Defining a function that plots SO2, NO2, RSPM, and SPM yearly average levels
def plot_for_state(state):
fig, ax = plt.subplots(2, 2, figsize=(20, 12))
fig.suptitle(state, size=20)
# Filter data for the specific state and set date as index
state_data = aqi[aqi.state == state].copy()
state_data.set_index('date', inplace=True)
# Resample data by year and calculate mean for each pollutant
state_data = state_data[VALUE_COLS].resample('Y').mean()
# Plot each pollutant in a subplot
state_data['so2'].plot(legend=True, ax=ax[0][0], title="SO2")
ax[0][0].set_ylabel("SO2 (μg/m3)")
ax[0][0].set_xlabel("Year")
state_data['no2'].plot(legend=True, ax=ax[0][1], title="NO2")
ax[0][1].set_ylabel("NO2 (μg/m3)")
ax[0][1].set_xlabel("Year")
state_data['rspm'].plot(legend=True, ax=ax[1][0], title="RSPM")
ax[1][0].set_ylabel("RSPM (PM10 μg/m3)")
ax[1][0].set_xlabel("Year")
state_data['spm'].plot(legend=True, ax=ax[1][1], title="SPM")
ax[1][1].set_ylabel("SPM (PM10 μg/m3)")
ax[1][1].set_xlabel("Year")
# Plotting for Uttar Pradesh
plot_for_state("Uttar Pradesh")
# Defining a function to find and plot the top 10 and bottom 10 states for a given indicator
def top_and_bottom_10_states(indicator="so2"):
fig, ax = plt.subplots(2, 1, figsize=(20, 12))
ind = aqi[[indicator, 'state']].groupby('state', as_index=False).median()
top10 = sns.barplot(x='state', y=indicator, data=ind.nlargest(10, indicator), ax=ax[0])
top10.set_title("Top 10 states by {} (1991-2016)".format(indicator))
top10.set_ylabel("{} (μg/m3)".format(indicator))
top10.set_xlabel("State")
bottom10 = sns.barplot(x='state', y=indicator, data=ind.nsmallest(10, indicator), ax=ax[1])
bottom10.set_title("Bottom 10 states by {} (1991-2016)".format(indicator))
bottom10.set_ylabel("{} (μg/m3)".format(indicator))
bottom10.set_xlabel("State")
# Plotting top and bottom states for SO2 and NO2
top_and_bottom_10_states("so2")
top_and_bottom_10_states("no2")
# Defining a function to find the highest ever recorded levels for a given indicator
def highest_levels_recorded(indicator="so2"):
plt.figure(figsize=(20, 10))
ind = aqi[[indicator, 'location', 'state', 'date']].groupby('state', as_index=False).max()
highest = sns.barplot(x='state', y=indicator, data=ind)
highest.set_title("Highest ever {} levels recorded by state".format(indicator))
plt.xticks(rotation=90)
# Plotting highest levels recorded for NO2 and RSPM
highest_levels_recorded("no2")
highest_levels_recorded("rspm")
# Defining a function to plot the yearly trend values for a given indicator
def yearly_trend(state="", indicator="so2"):
plt.figure(figsize=(20, 12))
aqi['year'] = aqi.date.dt.year
if state == "":
year_wise = aqi[[indicator, 'year', 'state']].groupby('year', as_index=False).mean()
trend = sns.pointplot(x='year', y=indicator, data=year_wise)
trend.set_title('Yearly trend of {}'.format(indicator))
else:
year_wise = aqi[[indicator, 'year', 'state']].groupby(['state', 'year'], as_index=False).mean()
trend = sns.pointplot(x='year', y=indicator, data=year_wise)
trend.set_title('Yearly trend of {} for {}'.format(indicator, state))
# Plotting yearly trends for all states and West Bengal for NO2
yearly_trend()
yearly_trend("West Bengal", "no2")
# Defining a function to plot a heatmap for yearly median average for a given indicator
def indicator_by_state_and_year(indicator="so2"):
plt.figure(figsize=(20, 20))
hmap = sns.heatmap(
data=aqi.pivot_table(values=indicator, index='state', columns='year', aggfunc='mean'),
annot=True, linewidths=.5, cbar=True, square=True, cmap='inferno'
)
hmap.set_title("{} by state and year".format(indicator))
# Plotting heatmap for NO2
indicator_by_state_and_year('no2')
# Defining a function to plot pollutant averages by type for a given indicator
def type_avg(indicator=""):
type_avg = aqi[VALUE_COLS + ['type', 'date']].groupby("type").mean()
if indicator != "":
t = type_avg[indicator].plot(kind='bar')
plt.xticks(rotation=0)
plt.title("Pollutant average by type for {}".format(indicator))
else:
t = type_avg.plot(kind='bar')
plt.xticks(rotation=0)
plt.title("Pollutant average by type")
# Plotting pollutant average by type for SO2
type_avg('so2')
# Defining a function to plot pollutant averages for a given indicator (default SO2)
def location_avgs(state, indicator="so2"):
locs = aqi[VALUE_COLS + ['state', 'location', 'date']].groupby(['state', 'location']).mean()
state_avgs = locs.loc[state].reset_index()
sns.barplot(x='location', y=indicator, data=state_avgs)
plt.title("Location-wise average for {} in {}".format(indicator, state))
plt.xticks(rotation=90)
# Plotting location averages for Uttar Pradesh and NO2
location_avgs("Uttar Pradesh", "no2")
stn_code 144077 sampling_date 3 state 0 location 3 agency 149481 type 5393 so2 34646 no2 16233 rspm 40222 spm 237387 location_monitoring_station 27491 pm2_5 426428 date 7 dtype: int64 (435742, 13) <class 'pandas.core.frame.DataFrame'> RangeIndex: 435742 entries, 0 to 435741 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 stn_code 291665 non-null object 1 sampling_date 435739 non-null object 2 state 435742 non-null object 3 location 435739 non-null object 4 agency 286261 non-null object 5 type 430349 non-null object 6 so2 401096 non-null float64 7 no2 419509 non-null float64 8 rspm 395520 non-null float64 9 spm 198355 non-null float64 10 location_monitoring_station 408251 non-null object 11 pm2_5 9314 non-null float64 12 date 435735 non-null datetime64[ns] dtypes: datetime64[ns](1), float64(5), object(7) memory usage: 43.2+ MB state 0 location 0 type 0 so2 0 no2 0 rspm 0 spm 0 pm2_5 0 date 0 dtype: int64
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\groupby\groupby.py:1874, in GroupBy._agg_py_fallback(self, how, values, ndim, alt) 1873 try: -> 1874 res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True) 1875 except Exception as err: File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\groupby\ops.py:849, in BaseGrouper.agg_series(self, obj, func, preserve_dtype) 847 preserve_dtype = True --> 849 result = self._aggregate_series_pure_python(obj, func) 851 if len(obj) == 0 and len(result) == 0 and isinstance(obj.dtype, ExtensionDtype): File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\groupby\ops.py:877, in BaseGrouper._aggregate_series_pure_python(self, obj, func) 876 for i, group in enumerate(splitter): --> 877 res = func(group) 878 res = extract_result(res) File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\groupby\groupby.py:2380, in GroupBy.mean.<locals>.<lambda>(x) 2377 else: 2378 result = self._cython_agg_general( 2379 "mean", -> 2380 alt=lambda x: Series(x).mean(numeric_only=numeric_only), 2381 numeric_only=numeric_only, 2382 ) 2383 return result.__finalize__(self.obj, method="groupby") File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\series.py:6225, in Series.mean(self, axis, skipna, numeric_only, **kwargs) 6217 @doc(make_doc("mean", ndim=1)) 6218 def mean( 6219 self, (...) 6223 **kwargs, 6224 ): -> 6225 return NDFrame.mean(self, axis, skipna, numeric_only, **kwargs) File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\generic.py:11992, in NDFrame.mean(self, axis, skipna, numeric_only, **kwargs) 11985 def mean( 11986 self, 11987 axis: Axis | None = 0, (...) 11990 **kwargs, 11991 ) -> Series | float: > 11992 return self._stat_function( 11993 "mean", nanops.nanmean, axis, skipna, numeric_only, **kwargs 11994 ) File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\generic.py:11949, in NDFrame._stat_function(self, name, func, axis, skipna, numeric_only, **kwargs) 11947 validate_bool_kwarg(skipna, "skipna", none_allowed=False) > 11949 return self._reduce( 11950 func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only 11951 ) File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\series.py:6133, in Series._reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds) 6129 raise TypeError( 6130 f"Series.{name} does not allow {kwd_name}={numeric_only} " 6131 "with non-numeric dtypes." 6132 ) -> 6133 return op(delegate, skipna=skipna, **kwds) File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\nanops.py:147, in bottleneck_switch.__call__.<locals>.f(values, axis, skipna, **kwds) 146 else: --> 147 result = alt(values, axis=axis, skipna=skipna, **kwds) 149 return result File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\nanops.py:404, in _datetimelike_compat.<locals>.new_func(values, axis, skipna, mask, **kwargs) 402 mask = isna(values) --> 404 result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs) 406 if datetimelike: File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\nanops.py:720, in nanmean(values, axis, skipna, mask) 719 the_sum = values.sum(axis, dtype=dtype_sum) --> 720 the_sum = _ensure_numeric(the_sum) 722 if axis is not None and getattr(the_sum, "ndim", False): File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\nanops.py:1693, in _ensure_numeric(x) 1691 if isinstance(x, str): 1692 # GH#44008, GH#36703 avoid casting e.g. strings to numeric -> 1693 raise TypeError(f"Could not convert string '{x}' to numeric") 1694 try: TypeError: Could not convert string 'BiharBiharBiharBiharBiharBiharBiharBiharBiharBiharBiharBiharBiharBiharBiharBiharBiharBiharDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiGoaGoaGoaGoaGoaGoaGoaGoaGoaGoaGoaGoaGoaGoaGoaGoaGoaGoaGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratHaryanaHaryanaHaryanaHaryanaHaryanaHaryanaHaryanaHaryanaHaryanaHaryanaHaryanaHimachal PradeshHimachal PradeshHimachal PradeshHimachal PradeshHimachal PradeshHimachal PradeshHimachal PradeshHimachal PradeshKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKeralaMaharashtraMaharashtraMaharashtraMaharashtraMaharashtraMaharashtraMaharashtraMaharashtraMaharashtraOdishaOdishaOdishaPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest Bengal' to numeric The above exception was the direct cause of the following exception: TypeError Traceback (most recent call last) Cell In[18], line 143 140 trend.set_title('Yearly trend of {} for {}'.format(indicator, state)) 142 # Plotting yearly trends for all states and West Bengal for NO2 --> 143 yearly_trend() 144 yearly_trend("West Bengal", "no2") 146 # Defining a function to plot a heatmap for yearly median average for a given indicator Cell In[18], line 134, in yearly_trend(state, indicator) 131 aqi['year'] = aqi.date.dt.year 133 if state == "": --> 134 year_wise = aqi[[indicator, 'year', 'state']].groupby('year', as_index=False).mean() 135 trend = sns.pointplot(x='year', y=indicator, data=year_wise) 136 trend.set_title('Yearly trend of {}'.format(indicator)) File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\groupby\groupby.py:2378, in GroupBy.mean(self, numeric_only, engine, engine_kwargs) 2371 return self._numba_agg_general( 2372 grouped_mean, 2373 executor.float_dtype_mapping, 2374 engine_kwargs, 2375 min_periods=0, 2376 ) 2377 else: -> 2378 result = self._cython_agg_general( 2379 "mean", 2380 alt=lambda x: Series(x).mean(numeric_only=numeric_only), 2381 numeric_only=numeric_only, 2382 ) 2383 return result.__finalize__(self.obj, method="groupby") File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\groupby\groupby.py:1929, in GroupBy._cython_agg_general(self, how, alt, numeric_only, min_count, **kwargs) 1926 result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt) 1927 return result -> 1929 new_mgr = data.grouped_reduce(array_func) 1930 res = self._wrap_agged_manager(new_mgr) 1931 out = self._wrap_aggregated_output(res) File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\internals\managers.py:1428, in BlockManager.grouped_reduce(self, func) 1424 if blk.is_object: 1425 # split on object-dtype blocks bc some columns may raise 1426 # while others do not. 1427 for sb in blk._split(): -> 1428 applied = sb.apply(func) 1429 result_blocks = extend_blocks(applied, result_blocks) 1430 else: File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\internals\blocks.py:366, in Block.apply(self, func, **kwargs) 360 @final 361 def apply(self, func, **kwargs) -> list[Block]: 362 """ 363 apply the function to my values; return a block if we are not 364 one 365 """ --> 366 result = func(self.values, **kwargs) 368 result = maybe_coerce_values(result) 369 return self._split_op_result(result) File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\groupby\groupby.py:1926, in GroupBy._cython_agg_general.<locals>.array_func(values) 1923 else: 1924 return result -> 1926 result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt) 1927 return result File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\groupby\groupby.py:1878, in GroupBy._agg_py_fallback(self, how, values, ndim, alt) 1876 msg = f"agg function failed [how->{how},dtype->{ser.dtype}]" 1877 # preserve the kind of exception that raised -> 1878 raise type(err)(msg) from err 1880 if ser.dtype == object: 1881 res_values = res_values.astype(object, copy=False) TypeError: agg function failed [how->mean,dtype->object]
<Figure size 2000x1200 with 0 Axes>
In [ ]:
In [ ]:
''' Pract 12 -
Data Aggregation
Problem Statement: Analyzing Sales Performance by Region in a Retail Company
Dataset: "Retail_Sales_Data.csv"
Description: The dataset contains information about sales transactions in a retail company. It
includes attributes such as transaction date, product category, quantity sold, and sales
amount. The goal is to perform data aggregation to analyze the sales performance by region
and identify the top-performing regions.
Tasks to Perform:
1. Import the "Retail_Sales_Data.csv" dataset.
2. Explore the dataset to understand its structure and content.
3. Identify the relevant variables for aggregating sales data, such as region, sales
amount, and product category.
4. Group the sales data by region and calculate the total sales amount for each region.
5. Create bar plots or pie charts to visualize the sales distribution by region.
6. Identify the top-performing regions based on the highest sales amount.
7. Group the sales data by region and product category to calculate the total sales
amount for each combination.
8. Create stacked bar plots or grouped bar plots to compare the sales amounts across
different regions and product categories.
In [19]:
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv("customer_shopping_data.csv")
df.head()
df.tail()
# To check the count of records grouped by region/branch of the mall
df.groupby("shopping_mall").count()
# To check the count of records grouped by the product categories
df.groupby("category").count()
# total sales for each mall branch
branch_sales = df.groupby("shopping_mall").sum()
branch_sales
# total sales for each category of product
category_sales = df.groupby("category").sum()
category_sales
# to get the top performing branches
branch_sales.sort_values(by = "price", ascending = False)
# to get the top selling categories
category_sales.sort_values(by = "price", ascending = False)
# to get total sales for each combination of branch and product_category
combined_branch_category_sales = df.groupby(["shopping_mall", "category"]).sum()
combined_branch_category_sales
# pie chart for sales by branch
plt.pie(branch_sales["price"], labels = branch_sales.index)
plt.show()
# pie chart for sales by product category
plt.pie(category_sales["price"], labels = category_sales.index)
plt.show()
combined_pivot = df.pivot_table(index="shopping_mall", columns="category", values="price", aggfunc="sum")
# grouped bar chart for sales of different categories at different branches
combined_pivot.plot(kind="bar", figsize=(10, 6))
plt.show()
In [ ]:
In [ ]:
In [ ]: